author     trav90 <travawine@palemoon.org>   2018-10-15 21:45:30 -0500
committer  trav90 <travawine@palemoon.org>   2018-10-15 21:45:30 -0500
commit     68569dee1416593955c1570d638b3d9250b33012 (patch)
tree       d960f017cd7eba3f125b7e8a813789ee2e076310 /third_party/aom/aom_dsp
parent     07c17b6b98ed32fcecff15c083ab0fd878de3cf0 (diff)
Import aom library
This is the reference implementation for the Alliance for Open Media's AV1 video codec. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36.
Diffstat (limited to 'third_party/aom/aom_dsp')
-rw-r--r--  third_party/aom/aom_dsp/add_noise.c  73
-rw-r--r--  third_party/aom/aom_dsp/ans.h  44
-rw-r--r--  third_party/aom/aom_dsp/ansreader.h  214
-rw-r--r--  third_party/aom/aom_dsp/answriter.h  148
-rw-r--r--  third_party/aom/aom_dsp/aom_convolve.c  854
-rw-r--r--  third_party/aom/aom_dsp/aom_convolve.h  57
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp.cmake  509
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp.mk  428
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp_common.h  107
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp_rtcd.c  16
-rwxr-xr-x  third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl  1495
-rw-r--r--  third_party/aom/aom_dsp/aom_filter.h  43
-rw-r--r--  third_party/aom/aom_dsp/aom_simd.h  37
-rw-r--r--  third_party/aom/aom_dsp/aom_simd_inline.h  21
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c  364
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm  295
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve8_neon.c  331
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm  273
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c  145
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm  119
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c  93
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm  87
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve_neon.c  66
-rw-r--r--  third_party/aom/aom_dsp/arm/avg_neon.c  254
-rw-r--r--  third_party/aom/aom_dsp/arm/bilinear_filter_media.asm  240
-rw-r--r--  third_party/aom/aom_dsp/arm/fwd_txfm_neon.c  221
-rw-r--r--  third_party/aom/aom_dsp/arm/hadamard_neon.c  200
-rw-r--r--  third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm  201
-rw-r--r--  third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c  59
-rw-r--r--  third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm  1182
-rw-r--r--  third_party/aom/aom_dsp/arm/idct16x16_add_neon.c  1295
-rw-r--r--  third_party/aom/aom_dsp/arm/idct16x16_neon.c  152
-rw-r--r--  third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm  147
-rw-r--r--  third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c  141
-rw-r--r--  third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm  1302
-rw-r--r--  third_party/aom/aom_dsp/arm/idct32x32_add_neon.c  686
-rw-r--r--  third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm  71
-rw-r--r--  third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c  47
-rw-r--r--  third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm  193
-rw-r--r--  third_party/aom/aom_dsp/arm/idct4x4_add_neon.c  146
-rw-r--r--  third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm  91
-rw-r--r--  third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c  62
-rw-r--r--  third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm  522
-rw-r--r--  third_party/aom/aom_dsp/arm/idct8x8_add_neon.c  509
-rw-r--r--  third_party/aom/aom_dsp/arm/intrapred_neon.c  757
-rw-r--r--  third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm  633
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm  202
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_16_neon.c  174
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm  252
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_4_neon.c  250
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm  428
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_8_neon.c  430
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm  638
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_neon.c  49
-rw-r--r--  third_party/aom/aom_dsp/arm/sad4d_neon.c  225
-rw-r--r--  third_party/aom/aom_dsp/arm/sad_media.asm  98
-rw-r--r--  third_party/aom/aom_dsp/arm/sad_neon.c  224
-rw-r--r--  third_party/aom/aom_dsp/arm/save_reg_neon.asm  39
-rw-r--r--  third_party/aom/aom_dsp/arm/subpel_variance_media.c  81
-rw-r--r--  third_party/aom/aom_dsp/arm/subpel_variance_neon.c  134
-rw-r--r--  third_party/aom/aom_dsp/arm/subtract_neon.c  80
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm  185
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm  225
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm  187
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_media.asm  361
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_neon.c  400
-rw-r--r--  third_party/aom/aom_dsp/avg.c  232
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_reader.c  117
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_reader.h  38
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_writer.c  211
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_writer.h  70
-rw-r--r--  third_party/aom/aom_dsp/bitreader.h  276
-rw-r--r--  third_party/aom/aom_dsp/bitreader_buffer.c  47
-rw-r--r--  third_party/aom/aom_dsp/bitreader_buffer.h  48
-rw-r--r--  third_party/aom/aom_dsp/bitwriter.h  255
-rw-r--r--  third_party/aom/aom_dsp/bitwriter_buffer.c  61
-rw-r--r--  third_party/aom/aom_dsp/bitwriter_buffer.h  44
-rw-r--r--  third_party/aom/aom_dsp/blend.h  42
-rw-r--r--  third_party/aom/aom_dsp/blend_a64_hmask.c  71
-rw-r--r--  third_party/aom/aom_dsp/blend_a64_mask.c  145
-rw-r--r--  third_party/aom/aom_dsp/blend_a64_vmask.c  73
-rw-r--r--  third_party/aom/aom_dsp/buf_ans.c  71
-rw-r--r--  third_party/aom/aom_dsp/buf_ans.h  133
-rw-r--r--  third_party/aom/aom_dsp/daalaboolreader.c  37
-rw-r--r--  third_party/aom/aom_dsp/daalaboolreader.h  164
-rw-r--r--  third_party/aom/aom_dsp/daalaboolwriter.c  32
-rw-r--r--  third_party/aom/aom_dsp/daalaboolwriter.h  87
-rw-r--r--  third_party/aom/aom_dsp/dkboolreader.c  110
-rw-r--r--  third_party/aom/aom_dsp/dkboolreader.h  181
-rw-r--r--  third_party/aom/aom_dsp/dkboolwriter.c  44
-rw-r--r--  third_party/aom/aom_dsp/dkboolwriter.h  104
-rw-r--r--  third_party/aom/aom_dsp/entcode.c  53
-rw-r--r--  third_party/aom/aom_dsp/entcode.h  46
-rw-r--r--  third_party/aom/aom_dsp/entdec.c  300
-rw-r--r--  third_party/aom/aom_dsp/entdec.h  91
-rw-r--r--  third_party/aom/aom_dsp/entenc.c  507
-rw-r--r--  third_party/aom/aom_dsp/entenc.h  91
-rw-r--r--  third_party/aom/aom_dsp/fastssim.c  493
-rw-r--r--  third_party/aom/aom_dsp/fwd_txfm.c  809
-rw-r--r--  third_party/aom/aom_dsp/fwd_txfm.h  29
-rw-r--r--  third_party/aom/aom_dsp/intrapred.c  971
-rw-r--r--  third_party/aom/aom_dsp/inv_txfm.c  1445
-rw-r--r--  third_party/aom/aom_dsp/inv_txfm.h  91
-rw-r--r--  third_party/aom/aom_dsp/loopfilter.c  900
-rw-r--r--  third_party/aom/aom_dsp/mips/add_noise_msa.c  60
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c  704
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c  605
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c  677
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c  692
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_msa.c  630
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c  699
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c  233
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c  248
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve_msa.h  124
-rw-r--r--  third_party/aom/aom_dsp/mips/avg_msa.c  57
-rw-r--r--  third_party/aom/aom_dsp/mips/common_dspr2.c  31
-rw-r--r--  third_party/aom/aom_dsp/mips/common_dspr2.h  49
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c  256
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c  802
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_dspr2.c  1030
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c  681
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c  237
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c  641
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c  998
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_dspr2.c  1590
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c  878
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c  360
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve_common_dspr2.h  59
-rw-r--r--  third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c  948
-rw-r--r--  third_party/aom/aom_dsp/mips/fwd_txfm_msa.c  246
-rw-r--r--  third_party/aom/aom_dsp/mips/fwd_txfm_msa.h  381
-rw-r--r--  third_party/aom/aom_dsp/mips/idct16x16_msa.c  486
-rw-r--r--  third_party/aom/aom_dsp/mips/idct32x32_msa.c  730
-rw-r--r--  third_party/aom/aom_dsp/mips/idct4x4_msa.c  99
-rw-r--r--  third_party/aom/aom_dsp/mips/idct8x8_msa.c  117
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred16_dspr2.c  325
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred4_dspr2.c  225
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred8_dspr2.c  603
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred_msa.c  739
-rw-r--r--  third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h  80
-rw-r--r--  third_party/aom/aom_dsp/mips/inv_txfm_msa.h  412
-rw-r--r--  third_party/aom/aom_dsp/mips/itrans16_dspr2.c  1190
-rw-r--r--  third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c  1042
-rw-r--r--  third_party/aom/aom_dsp/mips/itrans32_dspr2.c  1030
-rw-r--r--  third_party/aom/aom_dsp/mips/itrans4_dspr2.c  342
-rw-r--r--  third_party/aom/aom_dsp/mips/itrans8_dspr2.c  645
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_16_msa.c  1487
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_4_msa.c  147
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_8_msa.c  333
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c  327
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h  735
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h  436
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h  356
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c  589
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c  734
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c  757
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_msa.h  251
-rw-r--r--  third_party/aom/aom_dsp/mips/macros_msa.h  2057
-rw-r--r--  third_party/aom/aom_dsp/mips/sad_msa.c  1529
-rw-r--r--  third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c  1795
-rw-r--r--  third_party/aom/aom_dsp/mips/subtract_msa.c  265
-rw-r--r--  third_party/aom/aom_dsp/mips/txfm_macros_msa.h  97
-rw-r--r--  third_party/aom/aom_dsp/mips/variance_msa.c  632
-rw-r--r--  third_party/aom/aom_dsp/postproc.h  26
-rw-r--r--  third_party/aom/aom_dsp/prob.c  236
-rw-r--r--  third_party/aom/aom_dsp/prob.h  198
-rw-r--r--  third_party/aom/aom_dsp/psnr.c  373
-rw-r--r--  third_party/aom/aom_dsp/psnr.h  79
-rw-r--r--  third_party/aom/aom_dsp/psnrhvs.c  276
-rw-r--r--  third_party/aom/aom_dsp/quantize.c  832
-rw-r--r--  third_party/aom/aom_dsp/quantize.h  120
-rw-r--r--  third_party/aom/aom_dsp/sad.c  512
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics.h  268
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h  671
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_c.h  707
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h  511
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics.h  283
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h  17
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_c.h  724
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h  545
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h  548
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics.h  223
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h  583
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_c.h  919
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h  470
-rw-r--r--  third_party/aom/aom_dsp/ssim.c  462
-rw-r--r--  third_party/aom/aom_dsp/ssim.h  88
-rw-r--r--  third_party/aom/aom_dsp/subtract.c  55
-rw-r--r--  third_party/aom/aom_dsp/sum_squares.c  40
-rw-r--r--  third_party/aom/aom_dsp/txfm_common.h  70
-rw-r--r--  third_party/aom/aom_dsp/variance.c  1249
-rw-r--r--  third_party/aom/aom_dsp/variance.h  132
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_asm_stubs.c  182
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm  345
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm  965
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm  497
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c  575
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c  920
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm  990
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm  883
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm  451
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm  421
-rw-r--r--  third_party/aom/aom_dsp/x86/avg_intrin_sse2.c  426
-rw-r--r--  third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm  124
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c  36
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c  924
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c  285
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_sse4.h  146
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve.h  288
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c  862
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h  3022
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h  3201
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c  24
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h  35
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h  1014
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c  273
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h  362
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm  204
-rw-r--r--  third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm  349
-rw-r--r--  third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c  77
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c  1151
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm  456
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c  1140
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c  155
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm  290
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm  366
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm  1040
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c  364
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm  316
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_sse2.c  695
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_sse4.c  216
-rw-r--r--  third_party/aom/aom_dsp/x86/intrapred_sse2.asm  771
-rw-r--r--  third_party/aom/aom_dsp/x86/intrapred_ssse3.asm  410
-rw-r--r--  third_party/aom/aom_dsp/x86/inv_txfm_sse2.c  3631
-rw-r--r--  third_party/aom/aom_dsp/x86/inv_txfm_sse2.h  265
-rw-r--r--  third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c  1333
-rw-r--r--  third_party/aom/aom_dsp/x86/inv_wht_sse2.asm  112
-rw-r--r--  third_party/aom/aom_dsp/x86/loopfilter_avx2.c  915
-rw-r--r--  third_party/aom/aom_dsp/x86/loopfilter_sse2.c  1892
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c  334
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c  1948
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_sad_sse4.c  262
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_variance_sse4.c  355
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm  547
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_sse2.c  249
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm  349
-rw-r--r--  third_party/aom/aom_dsp/x86/sad4d_avx2.c  216
-rw-r--r--  third_party/aom/aom_dsp/x86/sad4d_sse2.asm  253
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_avx2.c  187
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_highbd_avx2.c  1043
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_impl_avx2.c  233
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_sse2.asm  345
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_sse3.asm  377
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_sse4.asm  362
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_ssse3.asm  373
-rw-r--r--  third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm  219
-rw-r--r--  third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm  1489
-rw-r--r--  third_party/aom/aom_dsp/x86/subtract_sse2.asm  150
-rw-r--r--  third_party/aom/aom_dsp/x86/sum_squares_sse2.c  210
-rw-r--r--  third_party/aom/aom_dsp/x86/synonyms.h  120
-rw-r--r--  third_party/aom/aom_dsp/x86/txfm_common_avx2.h  204
-rw-r--r--  third_party/aom/aom_dsp/x86/txfm_common_intrin.h  31
-rw-r--r--  third_party/aom/aom_dsp/x86/txfm_common_sse2.h  326
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_avx2.c  192
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_impl_avx2.c  713
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_sse2.c  690
266 files changed, 119012 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c
new file mode 100644
index 000000000..389cf2049
--- /dev/null
+++ b/third_party/aom/aom_dsp/add_noise.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
+ char whiteclamp[16], char bothclamp[16],
+ unsigned int width, unsigned int height, int pitch) {
+ unsigned int i, j;
+
+ for (i = 0; i < height; ++i) {
+ uint8_t *pos = start + i * pitch;
+ char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT
+
+ for (j = 0; j < width; ++j) {
+ int v = pos[j];
+
+ v = clamp(v - blackclamp[0], 0, 255);
+ v = clamp(v + bothclamp[0], 0, 255);
+ v = clamp(v - whiteclamp[0], 0, 255);
+
+ pos[j] = v + ref[j];
+ }
+ }
+}
+
+static double gaussian(double sigma, double mu, double x) {
+ return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+int aom_setup_noise(double sigma, int size, char *noise) {
+ char char_dist[256];
+ int next = 0, i, j;
+
+ // set up a 256 entry lookup that matches gaussian distribution
+ for (i = -32; i < 32; ++i) {
+ const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ if (a_i) {
+ for (j = 0; j < a_i; ++j) {
+ char_dist[next + j] = (char)i;
+ }
+ next = next + j;
+ }
+ }
+
+ // Rounding error - might mean we have less than 256.
+ for (; next < 256; ++next) {
+ char_dist[next] = 0;
+ }
+
+ for (i = 0; i < size; ++i) {
+ noise[i] = char_dist[rand() & 0xff]; // NOLINT
+ }
+
+ // Returns the highest non 0 value used in distribution.
+ return -char_dist[0];
+}
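
The two entry points above work as a pair: aom_setup_noise() fills a lookup buffer with a roughly Gaussian-distributed set of signed offsets and returns the largest offset magnitude it used, while aom_plane_add_noise_c() pre-clamps each pixel by that margin before adding an offset so the result stays in 0..255. The sketch below is not part of this import; the helper name add_noise_to_plane and the 3072-byte buffer are illustrative choices, and the clamp setup (clamp, clamp, 2 * clamp) mirrors the pattern used by the library's postprocessing callers.

#include <stdint.h>
#include <string.h>

/* Prototypes from add_noise.c (normally pulled in via ./aom_dsp_rtcd.h). */
int aom_setup_noise(double sigma, int size, char *noise);
void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
                           char whiteclamp[16], char bothclamp[16],
                           unsigned int width, unsigned int height, int pitch);

/* Illustrative helper: add grain-style noise of strength sigma to one
 * 8-bit plane with the given stride. */
static void add_noise_to_plane(uint8_t *plane, unsigned int width,
                               unsigned int height, int stride, double sigma) {
  char noise[3072];
  char blackclamp[16], whiteclamp[16], bothclamp[16];
  /* The return value is the largest offset in the table, so clamping the
   * plane by this margin keeps pos[j] + ref[j] inside 0..255. */
  const int clamp = aom_setup_noise(sigma, (int)sizeof(noise), noise);
  memset(blackclamp, clamp, 16);
  memset(whiteclamp, clamp, 16);
  memset(bothclamp, 2 * clamp, 16);
  aom_plane_add_noise_c(plane, noise, blackclamp, whiteclamp, bothclamp,
                        width, height, stride);
}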
diff --git a/third_party/aom/aom_dsp/ans.h b/third_party/aom/aom_dsp/ans.h
new file mode 100644
index 000000000..a7a2f0eab
--- /dev/null
+++ b/third_party/aom/aom_dsp/ans.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_ANS_H_
+#define AOM_DSP_ANS_H_
+// Constants, types and utilities for Asymmetric Numeral Systems
+// http://arxiv.org/abs/1311.2540v2
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Use windowed ANS, size is passed in at initialization
+#define ANS_MAX_SYMBOLS 1
+#define ANS_REVERSE 1
+
+typedef uint8_t AnsP8;
+#define ANS_P8_PRECISION 256u
+#define ANS_P8_SHIFT 8
+#define RANS_PROB_BITS 15
+#define RANS_PRECISION (1u << RANS_PROB_BITS)
+
+// L_BASE is the ANS base state. L_BASE % PRECISION must be 0.
+#define L_BASE (1u << 17)
+#define IO_BASE 256
+// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_DSP_ANS_H_
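
A quick way to read these constants: with L_BASE = 1 << 17 and IO_BASE = 256, the normalized state range I spelled out above is [131072, 33554431], and L_BASE is an exact multiple of both ANS_P8_PRECISION and RANS_PRECISION, which the renormalization thresholds in ansreader.h/answriter.h depend on. The compile-time checks below are an illustrative sketch only, assuming a C11 compiler and the aom include paths; they are not part of the import.

#include "aom_dsp/ans.h"

/* L_BASE must divide evenly by the probability precisions; the coders
 * compute thresholds of the form L_BASE / PRECISION * IO_BASE * prob. */
_Static_assert(L_BASE % ANS_P8_PRECISION == 0, "L_BASE % ANS_P8_PRECISION != 0");
_Static_assert(L_BASE % RANS_PRECISION == 0, "L_BASE % RANS_PRECISION != 0");

/* Range I = { L_BASE, ..., L_BASE * IO_BASE - 1 } = [131072, 33554431]. */
_Static_assert(L_BASE * (uint64_t)IO_BASE - 1 == 33554431, "documented range");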
diff --git a/third_party/aom/aom_dsp/ansreader.h b/third_party/aom/aom_dsp/ansreader.h
new file mode 100644
index 000000000..e50c63b2d
--- /dev/null
+++ b/third_party/aom/aom_dsp/ansreader.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_ANSREADER_H_
+#define AOM_DSP_ANSREADER_H_
+// An implementation of Asymmetric Numeral Systems
+// http://arxiv.org/abs/1311.2540v2
+// Implements decoding of:
+// * rABS (range Asymmetric Binary Systems), a boolean coder
+// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+#include "aom_dsp/ans.h"
+#include "aom_ports/mem_ops.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct AnsDecoder {
+ const uint8_t *buf;
+ int buf_offset;
+ uint32_t state;
+#if ANS_MAX_SYMBOLS
+ int symbols_left;
+ int window_size;
+#endif
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+};
+
+static INLINE int ans_read_reinit(struct AnsDecoder *const ans);
+
+static INLINE unsigned refill_state(struct AnsDecoder *const ans,
+ unsigned state) {
+#if ANS_REVERSE
+ while (state < L_BASE && ans->buf_offset < 0) {
+ state = state * IO_BASE + ans->buf[ans->buf_offset++];
+ }
+#else
+ while (state < L_BASE && ans->buf_offset > 0) {
+ state = state * IO_BASE + ans->buf[--ans->buf_offset];
+ }
+#endif
+ return state;
+}
+
+// Decode one rABS encoded boolean where the probability of the value being zero
+// is p0.
+static INLINE int rabs_read(struct AnsDecoder *ans, AnsP8 p0) {
+#if ANS_MAX_SYMBOLS
+ if (ans->symbols_left-- == 0) {
+ ans_read_reinit(ans);
+ ans->symbols_left--;
+ }
+#endif
+ unsigned state = refill_state(ans, ans->state);
+ const unsigned quotient = state / ANS_P8_PRECISION;
+ const unsigned remainder = state % ANS_P8_PRECISION;
+ const int value = remainder >= p0;
+ const unsigned qp0 = quotient * p0;
+ if (value)
+ state = state - qp0 - p0;
+ else
+ state = qp0 + remainder;
+ ans->state = state;
+ return value;
+}
+
+// Decode one rABS encoded boolean where the probability of the value being zero
+// is one half.
+static INLINE int rabs_read_bit(struct AnsDecoder *ans) {
+#if ANS_MAX_SYMBOLS
+ if (ans->symbols_left-- == 0) {
+ ans_read_reinit(ans);
+ ans->symbols_left--;
+ }
+#endif
+ unsigned state = refill_state(ans, ans->state);
+ const int value = !!(state & 0x80);
+ ans->state = ((state >> 1) & ~0x7F) | (state & 0x7F);
+ return value;
+}
+
+struct rans_dec_sym {
+ uint8_t val;
+ aom_cdf_prob prob;
+ aom_cdf_prob cum_prob; // not-inclusive
+};
+
+static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
+ aom_cdf_prob rem) {
+ int i;
+ aom_cdf_prob cum_prob = 0, top_prob;
+ // TODO(skal): if critical, could be a binary search.
+ // Or, better, an O(1) alias-table.
+ for (i = 0; rem >= (top_prob = cdf[i]); ++i) {
+ cum_prob = top_prob;
+ }
+ out->val = i;
+ out->prob = top_prob - cum_prob;
+ out->cum_prob = cum_prob;
+}
+
+static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
+ unsigned rem;
+ unsigned quo;
+ struct rans_dec_sym sym;
+#if ANS_MAX_SYMBOLS
+ if (ans->symbols_left-- == 0) {
+ ans_read_reinit(ans);
+ ans->symbols_left--;
+ }
+#endif
+ ans->state = refill_state(ans, ans->state);
+ quo = ans->state / RANS_PRECISION;
+ rem = ans->state % RANS_PRECISION;
+ fetch_sym(&sym, tab, rem);
+ ans->state = quo * sym.prob + rem - sym.cum_prob;
+ return sym.val;
+}
+
+static INLINE int ans_read_init(struct AnsDecoder *const ans,
+ const uint8_t *const buf, int offset) {
+ unsigned x;
+ if (offset < 1) return 1;
+#if ANS_REVERSE
+ ans->buf = buf + offset;
+ ans->buf_offset = -offset;
+ x = buf[0];
+ if ((x & 0x80) == 0) { // Marker is 0xxx xxxx
+ if (offset < 2) return 1;
+ ans->buf_offset += 2;
+ ans->state = mem_get_be16(buf) & 0x7FFF;
+#if L_BASE * IO_BASE > (1 << 23)
+ } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx
+ if (offset < 3) return 1;
+ ans->buf_offset += 3;
+ ans->state = mem_get_be24(buf) & 0x3FFFFF;
+ } else { // Marker is 11xx xxxx
+ if (offset < 4) return 1;
+ ans->buf_offset += 4;
+ ans->state = mem_get_be32(buf) & 0x3FFFFFFF;
+#else
+ } else { // Marker is 1xxx xxxx
+ if (offset < 3) return 1;
+ ans->buf_offset += 3;
+ ans->state = mem_get_be24(buf) & 0x7FFFFF;
+#endif
+ }
+#else
+ ans->buf = buf;
+ x = buf[offset - 1];
+ if ((x & 0x80) == 0) { // Marker is 0xxx xxxx
+ if (offset < 2) return 1;
+ ans->buf_offset = offset - 2;
+ ans->state = mem_get_le16(buf + offset - 2) & 0x7FFF;
+ } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx
+ if (offset < 3) return 1;
+ ans->buf_offset = offset - 3;
+ ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
+ } else if ((x & 0xE0) == 0xE0) { // Marker is 111x xxxx
+ if (offset < 4) return 1;
+ ans->buf_offset = offset - 4;
+ ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
+ } else {
+ // Marker 110x xxxx implies this byte is a superframe marker
+ return 1;
+ }
+#endif // ANS_REVERSE
+#if CONFIG_ACCOUNTING
+ ans->accounting = NULL;
+#endif
+ ans->state += L_BASE;
+ if (ans->state >= L_BASE * IO_BASE) return 1;
+#if ANS_MAX_SYMBOLS
+ assert(ans->window_size > 1);
+ ans->symbols_left = ans->window_size;
+#endif
+ return 0;
+}
+
+#if ANS_REVERSE
+static INLINE int ans_read_reinit(struct AnsDecoder *const ans) {
+ return ans_read_init(ans, ans->buf + ans->buf_offset, -ans->buf_offset);
+}
+#endif
+
+static INLINE int ans_read_end(const struct AnsDecoder *const ans) {
+ return ans->buf_offset == 0 && ans->state < L_BASE;
+}
+
+static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
+ return ans->state < L_BASE / RANS_PRECISION;
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_DSP_ANSREADER_H_
diff --git a/third_party/aom/aom_dsp/answriter.h b/third_party/aom/aom_dsp/answriter.h
new file mode 100644
index 000000000..353acf1a9
--- /dev/null
+++ b/third_party/aom/aom_dsp/answriter.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_ANSWRITER_H_
+#define AOM_DSP_ANSWRITER_H_
+// An implementation of Asymmetric Numeral Systems
+// http://arxiv.org/abs/1311.2540v2
+// Implements encoding of:
+// * rABS (range Asymmetric Binary Systems), a boolean coder
+// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/ans.h"
+#include "aom_dsp/prob.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/odintrin.h"
+
+#if RANS_PRECISION <= OD_DIVU_DMAX
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+ do { \
+ quotient = OD_DIVU_SMALL((dividend), (divisor)); \
+ remainder = (dividend) - (quotient) * (divisor); \
+ } while (0)
+#else
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+ do { \
+ quotient = (dividend) / (divisor); \
+ remainder = (dividend) % (divisor); \
+ } while (0)
+#endif
+
+#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct AnsCoder {
+ uint8_t *buf;
+ int buf_offset;
+ uint32_t state;
+};
+
+static INLINE void ans_write_init(struct AnsCoder *const ans,
+ uint8_t *const buf) {
+ ans->buf = buf;
+ ans->buf_offset = 0;
+ ans->state = L_BASE;
+}
+
+static INLINE int ans_write_end(struct AnsCoder *const ans) {
+ uint32_t state;
+ int ans_size;
+ assert(ans->state >= L_BASE);
+ assert(ans->state < L_BASE * IO_BASE);
+ state = ans->state - L_BASE;
+ if (state < (1u << 15)) {
+ mem_put_le16(ans->buf + ans->buf_offset, (0x00u << 15) + state);
+ ans_size = ans->buf_offset + 2;
+#if ANS_REVERSE
+#if L_BASE * IO_BASE > (1 << 23)
+ } else if (state < (1u << 22)) {
+ mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
+ ans_size = ans->buf_offset + 3;
+ } else if (state < (1u << 30)) {
+ mem_put_le32(ans->buf + ans->buf_offset, (0x03u << 30) + state);
+ ans_size = ans->buf_offset + 4;
+#else
+ } else if (state < (1u << 23)) {
+ mem_put_le24(ans->buf + ans->buf_offset, (0x01u << 23) + state);
+ ans_size = ans->buf_offset + 3;
+#endif
+#else
+ } else if (state < (1u << 22)) {
+ mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
+ ans_size = ans->buf_offset + 3;
+ } else if (state < (1u << 29)) {
+ mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state);
+ ans_size = ans->buf_offset + 4;
+#endif
+ } else {
+ assert(0 && "State is too large to be serialized");
+ return ans->buf_offset;
+ }
+#if ANS_REVERSE
+ {
+ int i;
+ uint8_t tmp;
+ for (i = 0; i < (ans_size >> 1); i++) {
+ tmp = ans->buf[i];
+ ans->buf[i] = ans->buf[ans_size - 1 - i];
+ ans->buf[ans_size - 1 - i] = tmp;
+ }
+ ans->buf += ans_size;
+ ans->buf_offset = 0;
+ ans->state = L_BASE;
+ }
+#endif
+ return ans_size;
+}
+
+// Write one boolean using rABS where p0 is the probability of the value being
+// zero.
+static INLINE void rabs_write(struct AnsCoder *ans, int value, AnsP8 p0) {
+ const AnsP8 p = ANS_P8_PRECISION - p0;
+ const unsigned l_s = value ? p : p0;
+ unsigned state = ans->state;
+ while (state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) {
+ ans->buf[ans->buf_offset++] = state % IO_BASE;
+ state /= IO_BASE;
+ }
+ const unsigned quotient = ANS_DIV8(state, l_s);
+ const unsigned remainder = state - quotient * l_s;
+ ans->state = quotient * ANS_P8_PRECISION + remainder + (value ? p0 : 0);
+}
+
+// Encode one symbol using rANS.
+// cum_prob: The cumulative probability before this symbol (the offset of
+// the symbol in the symbol cycle)
+// prob: The probability of this symbol (l_s from the paper)
+// RANS_PRECISION takes the place of m from the paper.
+static INLINE void rans_write(struct AnsCoder *ans, aom_cdf_prob cum_prob,
+ aom_cdf_prob prob) {
+ unsigned quotient, remainder;
+ while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * prob) {
+ ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
+ ans->state /= IO_BASE;
+ }
+ ANS_DIVREM(quotient, remainder, ans->state, prob);
+ ans->state = quotient * RANS_PRECISION + remainder + cum_prob;
+}
+
+#undef ANS_DIV8
+#undef ANS_DIVREM
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_DSP_ANSWRITER_H_
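
answriter.h and ansreader.h are designed as a pair: rabs_write()/rans_write() fold symbols into the coder state (emitting renormalization bytes into buf), ans_write_end() serializes the final state and, with ANS_REVERSE, byte-reverses the buffer, and ans_read_init() plus rabs_read()/rans_read() undo the process. The sketch below round-trips a single rABS boolean; it is deliberately one symbol so ANS's last-in-first-out symbol ordering does not come into play. It is illustrative only, assumes the aom build environment for the includes, and picks window_size = 16 arbitrarily (with ANS_MAX_SYMBOLS enabled it only needs to exceed the number of symbols decoded).

#include <string.h>
#include "aom_dsp/answriter.h"
#include "aom_dsp/ansreader.h"

/* Encode one rABS boolean with P(zero) = p0 / ANS_P8_PRECISION, then decode
 * it back. Returns 1 when the decoded bit matches, 0 on mismatch, and -1 on
 * a framing error from ans_read_init(). */
static int rabs_roundtrip_one_bit(int value) {
  uint8_t buf[8];
  const AnsP8 p0 = 128;

  struct AnsCoder enc;
  ans_write_init(&enc, buf);
  rabs_write(&enc, value, p0);
  const int size = ans_write_end(&enc);  /* reverses buf under ANS_REVERSE */

  struct AnsDecoder dec;
  memset(&dec, 0, sizeof(dec));
  dec.window_size = 16;  /* checked by ans_read_init(); must be > 1 */
  if (ans_read_init(&dec, buf, size)) return -1;
  const int decoded = rabs_read(&dec, p0);
  /* ans_read_end()/ans_reader_has_error() are available for end-of-stream
   * validation; omitted here to keep the sketch minimal. */
  return decoded == value;
}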
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
new file mode 100644
index 000000000..74f4c00fb
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_convolve.c
@@ -0,0 +1,854 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+ 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+ int x0_q4, int x_step_q4,
+ const InterpKernel *const y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
+ int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
+ MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
+ intermediate_height);
+ convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
+ dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ w, h);
+}
+
+void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h);
+}
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
+ w, h);
+}
+
+void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h);
+}
+
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ filters_y, y0_q4, y_step_q4, w, h);
+}
+
+void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+
+ aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
+ h);
+}
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int filter_x_stride, const int16_t *filter_y,
+ int filter_y_stride, int w, int h) {
+ int r;
+
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int filter_x_stride, const int16_t *filter_y,
+ int filter_y_stride, int w, int h) {
+ int x, y;
+
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+}
+
+void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+}
+
+void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+}
+
+void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+}
+
+#if CONFIG_LOOP_RESTORATION
+static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
+ src_x[SUBPEL_TAPS / 2 - 1]);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] =
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
+ src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters, int x0_q4,
+ int x_step_q4, const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
+ int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+ temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
+ intermediate_height);
+ convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
+ dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+}
+
+void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h);
+}
+
+void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h);
+}
+
+void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
+}
+#endif // CONFIG_LOOP_RESTORATION
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters, int x0_q4,
+ int x_step_q4, const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4, int w, int h, int bd) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
+ int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+ CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
+ x_step_q4, w, intermediate_height, bd);
+ highbd_convolve_vert(
+ CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ (void)filter_y;
+ (void)y_step_q4;
+
+ highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ (void)filter_y;
+ (void)y_step_q4;
+
+ highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ (void)filter_x;
+ (void)x_step_q4;
+
+ highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ (void)filter_x;
+ (void)x_step_q4;
+
+ highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ filters_y, y0_q4, y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ // Fixed size intermediate buffer places limits on parameters.
+ DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+
+ aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
+ aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
+ dst_stride, NULL, 0, NULL, 0, w, h, bd);
+}
+
+void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h, int bd) {
+ int r;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w * sizeof(uint16_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+#if CONFIG_LOOP_RESTORATION
+static void highbd_convolve_add_src_horiz(const uint8_t *src8,
+ ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride,
+ const InterpKernel *x_filters,
+ int x0_q4, int x_step_q4, int w,
+ int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
+ bd);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_add_src_vert(const uint8_t *src8,
+ ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride,
+ const InterpKernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h,
+ int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
+ src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
+ bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ int x0_q4, int x_step_q4,
+ const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4, int w, int h,
+ int bd) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
+ int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, CONVERT_TO_BYTEPTR(temp),
+ MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
+ intermediate_height, bd);
+ highbd_convolve_add_src_vert(
+ CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
+}
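
To make the bound discussed in the comment above concrete: with the largest 64x64 block, the normative lower limit on the scaling step (y_step_q4 = 32) and the worst sub-pel phase (y0_q4 = 15), the run-time expression floors to 134 rows, safely inside the 135-row MAX_EXT_SIZE budget that the comment derives with a round-up. A small self-contained check; the constants mirror aom_filter.h and the helper name is hypothetical:

    #include <assert.h>

    #define SUBPEL_BITS 4 /* mirrors aom_filter.h */
    #define SUBPEL_TAPS 8

    /* Same expression highbd_convolve_add_src() evaluates at run time. */
    static int intermediate_rows(int h, int y_step_q4, int y0_q4) {
      return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
    }

    int main(void) {
      /* 64-row block, 1/2 downscale, worst-case sub-pel offset. */
      assert(intermediate_rows(64, 32, 15) == 134); /* <= MAX_EXT_SIZE (135) */
      /* Unscaled case (y_step_q4 = 16) needs only 63 + 8 = 71 rows. */
      assert(intermediate_rows(64, 16, 0) == 71);
      return 0;
    }
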
+
+void aom_highbd_convolve8_add_src_horiz_c(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ (void)filter_y;
+ (void)y_step_q4;
+
+ highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ (void)filter_x;
+ (void)x_step_q4;
+
+ highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
+}
+#endif // CONFIG_LOOP_RESTORATION
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h
new file mode 100644
index 000000000..d0de6c5d2
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_convolve.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_DSP_AOM_CONVOLVE_H_
+#define AOM_DSP_AOM_CONVOLVE_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Note: Fixed size intermediate buffers place limits on parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+// (1) Interpolate horizontally into an intermediate buffer, temp.
+// (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels.
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+// original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define MAX_EXT_SIZE 263
+#else
+#define MAX_EXT_SIZE 135
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
+
+#if CONFIG_HIGHBITDEPTH
+typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_AOM_CONVOLVE_H_
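
The convolve_fn_t / highbd_convolve_fn_t typedefs above exist so callers can route every block through a single function pointer and let the RTCD layer swap in SIMD specializations. An illustrative sketch of that pattern, linked against the library's C fallbacks; the predict_block helper is hypothetical, and NULL filters are passed because the copy/avg variants ignore them (as the highbd versions earlier in this patch show):

    #include <stddef.h>
    #include <stdint.h>

    /* Mirrors the convolve_fn_t typedef from aom_convolve.h above. */
    typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h);

    /* C fallbacks from aom_convolve.c, resolved when linking the library. */
    void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h);
    void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h);

    /* Hypothetical helper: choose plain copy vs. rounding average through one
       pointer; an RTCD-style table could substitute SIMD versions here. */
    void predict_block(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, int w, int h, int average) {
      const convolve_fn_t fn =
          average ? aom_convolve_avg_c : aom_convolve_copy_c;
      fn(src, src_stride, dst, dst_stride, NULL, 0, NULL, 0, w, h);
    }
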
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
new file mode 100644
index 000000000..f00348cbc
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -0,0 +1,509 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+set(AOM_DSP_COMMON_SOURCES
+ "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+ "${AOM_ROOT}/aom_dsp/aom_convolve.h"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+ "${AOM_ROOT}/aom_dsp/aom_filter.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+ "${AOM_ROOT}/aom_dsp/blend.h"
+ "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+ "${AOM_ROOT}/aom_dsp/intrapred.c"
+ "${AOM_ROOT}/aom_dsp/loopfilter.c"
+ "${AOM_ROOT}/aom_dsp/prob.c"
+ "${AOM_ROOT}/aom_dsp/prob.h"
+ "${AOM_ROOT}/aom_dsp/sad.c"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/subtract.c"
+ "${AOM_ROOT}/aom_dsp/txfm_common.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_intrin.h")
+
+set(AOM_DSP_COMMON_ASM_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm")
+
+set(AOM_DSP_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
+ "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c")
+
+set(AOM_DSP_COMMON_ASM_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.asm")
+
+set(AOM_DSP_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")
+
+set(AOM_DSP_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
+
+set(AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
+
+set(AOM_DSP_COMMON_ASM_NEON
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm"
+ "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm")
+
+set(AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
+
+if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
+ set(AOM_DSP_COMMON_INTRIN_NEON
+ ${AOM_DSP_COMMON_INTRIN_NEON}
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
+endif ()
+
+set(AOM_DSP_COMMON_INTRIN_DSPR2
+ "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_horiz_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_horiz_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
+
+set(AOM_DSP_COMMON_INTRIN_MSA
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_vert_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve_avg_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
+ "${AOM_ROOT}/aom_dsp/mips/fwd_dct32x32_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.h"
+ "${AOM_ROOT}/aom_dsp/mips/idct16x16_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/idct32x32_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/idct4x4_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h"
+ "${AOM_ROOT}/aom_dsp/mips/macros_msa.h"
+ "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h")
+
+if (CONFIG_HIGHBITDEPTH)
+ set(AOM_DSP_COMMON_ASM_SSE2
+ ${AOM_DSP_COMMON_ASM_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm")
+
+ set(AOM_DSP_COMMON_INTRIN_SSE2
+ ${AOM_DSP_COMMON_INTRIN_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
+
+ set(AOM_DSP_COMMON_INTRIN_AVX2
+ ${AOM_DSP_COMMON_INTRIN_AVX2}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c")
+else ()
+ set(AOM_DSP_COMMON_INTRIN_DSPR2
+ ${AOM_DSP_COMMON_INTRIN_DSPR2}
+ "${AOM_ROOT}/aom_dsp/mips/itrans16_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/itrans32_cols_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/itrans32_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/itrans4_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/itrans8_dspr2.c")
+endif ()
+
+if (CONFIG_ANS)
+ set(AOM_DSP_COMMON_SOURCES
+ ${AOM_DSP_COMMON_SOURCES}
+ "${AOM_ROOT}/aom_dsp/ans.h")
+elseif (CONFIG_DAALA_EC)
+ set(AOM_DSP_COMMON_SOURCES
+ ${AOM_DSP_COMMON_SOURCES}
+ "${AOM_ROOT}/aom_dsp/entcode.c"
+ "${AOM_ROOT}/aom_dsp/entcode.h")
+endif ()
+
+if (CONFIG_AV1)
+ set(AOM_DSP_COMMON_SOURCES
+ ${AOM_DSP_COMMON_SOURCES}
+ "${AOM_ROOT}/aom_dsp/inv_txfm.c"
+ "${AOM_ROOT}/aom_dsp/inv_txfm.h")
+
+ set(AOM_DSP_COMMON_ASM_SSE2
+ ${AOM_DSP_COMMON_ASM_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
+
+ set(AOM_DSP_COMMON_INTRIN_SSE2
+ ${AOM_DSP_COMMON_INTRIN_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.h")
+endif ()
+
+if (CONFIG_DECODERS)
+ set(AOM_DSP_DECODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
+ "${AOM_ROOT}/aom_dsp/bitreader.h"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.h")
+
+ if (CONFIG_ANS)
+ set(AOM_DSP_DECODER_SOURCES
+ ${AOM_DSP_DECODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/ansreader.h")
+ elseif (CONFIG_DAALA_EC)
+ set(AOM_DSP_DECODER_SOURCES
+ ${AOM_DSP_DECODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
+ "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
+ "${AOM_ROOT}/aom_dsp/entdec.c"
+ "${AOM_ROOT}/aom_dsp/entdec.h")
+ else ()
+ set(AOM_DSP_DECODER_SOURCES
+ ${AOM_DSP_DECODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/dkboolreader.c"
+ "${AOM_ROOT}/aom_dsp/dkboolreader.h")
+ endif ()
+endif ()
+
+if (CONFIG_ENCODERS)
+ set(AOM_DSP_ENCODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+ "${AOM_ROOT}/aom_dsp/psnr.c"
+ "${AOM_ROOT}/aom_dsp/psnr.h"
+ "${AOM_ROOT}/aom_dsp/variance.c"
+ "${AOM_ROOT}/aom_dsp/variance.h")
+
+ set(AOM_DSP_ENCODER_ASM_SSE2
+ ${AOM_DSP_ENCODER_ASM_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm")
+
+ set(AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c")
+
+ set(AOM_DSP_ENCODER_ASM_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/sad_ssse3.asm")
+
+ set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
+ "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
+
+ set(AOM_DSP_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/aom_dsp/x86/sad_sse3.asm")
+ set(AOM_DSP_ENCODER_ASM_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sad_sse4.asm")
+
+ set(AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c")
+
+ if (CONFIG_AV1_ENCODER)
+ set(AOM_DSP_ENCODER_SOURCES
+ ${AOM_DSP_ENCODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/avg.c"
+ "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+ "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
+ "${AOM_ROOT}/aom_dsp/quantize.c"
+ "${AOM_ROOT}/aom_dsp/quantize.h"
+ "${AOM_ROOT}/aom_dsp/sum_squares.c")
+
+ set(AOM_DSP_ENCODER_INTRIN_SSE2
+ ${AOM_DSP_ENCODER_INTRIN_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_dct32_8cols_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c")
+
+ set(AOM_DSP_ENCODER_INTRIN_SSSE3
+ ${AOM_DSP_ENCODER_INTRIN_SSSE3}
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c")
+
+ set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+ ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}
+ "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
+
+ set(AOM_DSP_ENCODER_AVX_ASM_X86_64
+ ${AOM_DSP_ENCODER_AVX_ASM_X86_64}
+ "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
+
+ set(AOM_DSP_ENCODER_INTRIN_MSA
+ "${AOM_ROOT}/aom_dsp/mips/avg_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
+
+ if (CONFIG_HIGHBITDEPTH)
+ set(AOM_DSP_ENCODER_INTRIN_SSE2
+ ${AOM_DSP_ENCODER_INTRIN_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c")
+ endif ()
+ endif ()
+
+ if (CONFIG_HIGHBITDEPTH)
+ set(AOM_DSP_ENCODER_ASM_SSE2
+ ${AOM_DSP_ENCODER_ASM_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm")
+
+ set(AOM_DSP_ENCODER_INTRIN_SSE2
+ ${AOM_DSP_ENCODER_INTRIN_SSE2}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
+
+ set(AOM_DSP_ENCODER_INTRIN_SSE4_1
+ ${AOM_DSP_ENCODER_INTRIN_SSE4_1}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
+
+ set(AOM_DSP_ENCODER_INTRIN_AVX2
+ ${AOM_DSP_ENCODER_INTRIN_AVX2}
+ "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c")
+ endif ()
+
+ if (CONFIG_ANS)
+ set(AOM_DSP_ENCODER_SOURCES
+ ${AOM_DSP_ENCODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/answriter.h"
+ "${AOM_ROOT}/aom_dsp/buf_ans.c"
+ "${AOM_ROOT}/aom_dsp/buf_ans.h")
+ elseif (CONFIG_DAALA_EC)
+ set(AOM_DSP_ENCODER_SOURCES
+ ${AOM_DSP_ENCODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
+ "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
+ "${AOM_ROOT}/aom_dsp/entenc.c"
+ "${AOM_ROOT}/aom_dsp/entenc.h")
+ else ()
+ set(AOM_DSP_ENCODER_SOURCES
+ ${AOM_DSP_ENCODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/dkboolwriter.c"
+ "${AOM_ROOT}/aom_dsp/dkboolwriter.h")
+ endif ()
+
+ if (CONFIG_INTERNAL_STATS)
+ set(AOM_DSP_ENCODER_SOURCES
+ ${AOM_DSP_ENCODER_SOURCES}
+ "${AOM_ROOT}/aom_dsp/fastssim.c"
+ "${AOM_ROOT}/aom_dsp/psnrhvs.c"
+ "${AOM_ROOT}/aom_dsp/ssim.c"
+ "${AOM_ROOT}/aom_dsp/ssim.h")
+ endif ()
+endif ()
+
+if (CONFIG_MOTION_VAR)
+ set(AOM_DSP_ENCODER_INTRIN_SSE4_1
+ ${AOM_DSP_ENCODER_INTRIN_SSE4_1}
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+endif ()
+
+# Creates aom_dsp build targets. Must not be called until after libaom target
+# has been created.
+function (setup_aom_dsp_targets)
+ add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_common)
+ target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_dsp_common>)
+
+ if (CONFIG_DECODERS)
+ add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder)
+ target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_dsp_decoder>)
+ endif ()
+
+ if (CONFIG_ENCODERS)
+ add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder)
+ target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_dsp_encoder>)
+ endif ()
+
+ if (HAVE_SSE2)
+ add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE2")
+ if (CONFIG_ENCODERS)
+ add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE2")
+ endif()
+ endif ()
+
+ if (HAVE_SSE3 AND CONFIG_ENCODERS)
+ add_asm_library("aom_dsp_encoder_sse3" "AOM_DSP_ENCODER_INTRIN_SSE3" "aom")
+ endif ()
+
+ if (HAVE_SSSE3)
+ add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSSE3")
+
+ if (CONFIG_ENCODERS)
+ if ("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
+ ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
+ endif ()
+ add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSSE3")
+ endif ()
+ endif ()
+
+ if (HAVE_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE4_1")
+ if (CONFIG_ENCODERS)
+ if (AOM_DSP_ENCODER_INTRIN_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE4_1")
+ endif ()
+ add_asm_library("aom_dsp_encoder_sse4_1" "AOM_DSP_ENCODER_ASM_SSE4_1"
+ "aom")
+ endif ()
+ endif ()
+
+ if (HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64"
+ "aom")
+ endif ()
+
+ if (HAVE_AVX2)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_AVX2")
+ if (CONFIG_ENCODERS)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_AVX2")
+ endif ()
+ endif ()
+
+ if (HAVE_NEON_ASM)
+ if (AOM_ADS2GAS_REQUIRED)
+ add_gas_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom")
+ else ()
+ add_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom")
+ endif ()
+ endif ()
+
+ if (HAVE_NEON)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON")
+ endif ()
+
+ if (HAVE_DSPR2)
+ add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_DSPR2")
+ endif ()
+
+ if (HAVE_MSA)
+ add_intrinsics_object_library("" "msa" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_MSA")
+ if (CONFIG_ENCODERS)
+ add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_MSA")
+ endif ()
+ endif ()
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction ()
diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk
new file mode 100644
index 000000000..8c7241b83
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp.mk
@@ -0,0 +1,428 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+
+DSP_SRCS-yes += aom_dsp.mk
+DSP_SRCS-yes += aom_dsp_common.h
+
+DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms.h
+
+# bit reader
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+DSP_SRCS-$(CONFIG_ANS) += ans.h
+
+ifeq ($(CONFIG_ENCODERS),yes)
+ifeq ($(CONFIG_ANS),yes)
+DSP_SRCS-yes += answriter.h
+DSP_SRCS-yes += buf_ans.h
+DSP_SRCS-yes += buf_ans.c
+else ifeq ($(CONFIG_DAALA_EC),yes)
+DSP_SRCS-yes += entenc.c
+DSP_SRCS-yes += entenc.h
+DSP_SRCS-yes += daalaboolwriter.c
+DSP_SRCS-yes += daalaboolwriter.h
+else
+DSP_SRCS-yes += dkboolwriter.h
+DSP_SRCS-yes += dkboolwriter.c
+endif
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-yes += binary_codes_writer.c
+DSP_SRCS-yes += binary_codes_writer.h
+DSP_SRCS-yes += psnr.c
+DSP_SRCS-yes += psnr.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
+endif
+
+ifeq ($(CONFIG_DECODERS),yes)
+ifeq ($(CONFIG_ANS),yes)
+DSP_SRCS-yes += ansreader.h
+else ifeq ($(CONFIG_DAALA_EC),yes)
+DSP_SRCS-yes += entdec.c
+DSP_SRCS-yes += entdec.h
+DSP_SRCS-yes += daalaboolreader.c
+DSP_SRCS-yes += daalaboolreader.h
+else
+DSP_SRCS-yes += dkboolreader.h
+DSP_SRCS-yes += dkboolreader.c
+endif
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+DSP_SRCS-yes += binary_codes_reader.c
+DSP_SRCS-yes += binary_codes_reader.h
+endif
+
+# intra predictions
+DSP_SRCS-yes += intrapred.c
+
+ifeq ($(CONFIG_DAALA_EC),yes)
+DSP_SRCS-yes += entcode.c
+DSP_SRCS-yes += entcode.h
+endif
+
+DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
+
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+endif # CONFIG_HIGHBITDEPTH
+
+DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c
+
+DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c
+
+# inter predictions
+DSP_SRCS-yes += blend.h
+DSP_SRCS-yes += blend_a64_mask.c
+DSP_SRCS-yes += blend_a64_hmask.c
+DSP_SRCS-yes += blend_a64_vmask.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
+
+# interpolation filters
+DSP_SRCS-yes += aom_convolve.c
+DSP_SRCS-yes += aom_convolve.h
+DSP_SRCS-yes += aom_filter.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c
+DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/aom_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
+endif
+DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/aom_convolve_copy_neon.c
+DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/aom_convolve8_neon.c
+DSP_SRCS-yes += arm/aom_convolve_avg_neon.c
+DSP_SRCS-yes += arm/aom_convolve_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h
+
+# common (dspr2)
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_vert_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c
+
+# loop filters
+DSP_SRCS-yes += loopfilter.c
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
+
+DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM)
+DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/loopfilter_16_neon.c
+DSP_SRCS-yes += arm/loopfilter_8_neon.c
+DSP_SRCS-yes += arm/loopfilter_4_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_4_msa.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_macros_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c
+
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c
+endif # CONFIG_HIGHBITDEPTH
+
+DSP_SRCS-yes += txfm_common.h
+DSP_SRCS-yes += x86/txfm_common_intrin.h
+DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h
+DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h
+
+# forward transform
+ifneq ($(findstring yes,$(CONFIG_AV1)$(CONFIG_PVQ)),)
+DSP_SRCS-$(HAVE_AVX2) += x86/txfm_common_avx2.h
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+DSP_SRCS-yes += fwd_txfm.c
+DSP_SRCS-yes += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
+endif
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.h
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
+endif # CONFIG_AV1_ENCODER
+endif # CONFIG_AV1
+
+# inverse transform
+ifeq ($(CONFIG_AV1), yes)
+DSP_SRCS-yes += inv_txfm.h
+DSP_SRCS-yes += inv_txfm.c
+DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/save_reg_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/idct4x4_1_add_neon.c
+DSP_SRCS-yes += arm/idct4x4_add_neon.c
+DSP_SRCS-yes += arm/idct8x8_1_add_neon.c
+DSP_SRCS-yes += arm/idct8x8_add_neon.c
+DSP_SRCS-yes += arm/idct16x16_1_add_neon.c
+DSP_SRCS-yes += arm/idct16x16_add_neon.c
+DSP_SRCS-yes += arm/idct32x32_1_add_neon.c
+DSP_SRCS-yes += arm/idct32x32_add_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
+
+DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c
+
+ifneq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+endif # CONFIG_HIGHBITDEPTH
+endif # CONFIG_AV1
+
+# quantization
+ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
+DSP_SRCS-yes += quantize.c
+DSP_SRCS-yes += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
+endif
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm
+endif
+
+# avg
+DSP_SRCS-yes += avg.c
+DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
+DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
+endif
+
+# high bit depth subtract
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subtract_sse2.c
+endif
+
+endif # CONFIG_AV1_ENCODER
+
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+DSP_SRCS-yes += sum_squares.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
+endif # CONFIG_AV1_ENCODER
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += sad.c
+DSP_SRCS-yes += subtract.c
+
+DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
+
+DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
+
+DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c
+endif
+
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c
+endif #CONFIG_EXT_INTER
+ifeq ($(CONFIG_MOTION_VAR),yes)
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
+endif #CONFIG_MOTION_VAR
+ifeq ($(CONFIG_EXT_PARTITION),yes)
+DSP_SRCS-$(HAVE_AVX2) += x86/sad_impl_avx2.c
+endif
+endif #CONFIG_AV1_ENCODER
+
+DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
+
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+endif # CONFIG_HIGHBITDEPTH
+
+endif # CONFIG_ENCODERS
+
+ifneq ($(filter yes,$(CONFIG_ENCODERS)),)
+DSP_SRCS-yes += variance.c
+DSP_SRCS-yes += variance.h
+
+DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
+
+DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
+
+DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
+DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
+
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm
+endif # ARCH_X86_64
+
+DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
+
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+endif # CONFIG_HIGHBITDEPTH
+endif # CONFIG_ENCODERS
+
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += aom_dsp_rtcd.c
+DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
+
+DSP_SRCS-yes += aom_simd.h
+DSP_SRCS-yes += aom_simd_inline.h
+DSP_SRCS-yes += simd/v64_intrinsics.h
+DSP_SRCS-yes += simd/v64_intrinsics_c.h
+DSP_SRCS-yes += simd/v128_intrinsics.h
+DSP_SRCS-yes += simd/v128_intrinsics_c.h
+DSP_SRCS-yes += simd/v256_intrinsics.h
+DSP_SRCS-yes += simd/v256_intrinsics_c.h
+DSP_SRCS-yes += simd/v256_intrinsics_v128.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
+DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
+DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
+DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
+
+$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
new file mode 100644
index 000000000..47ffbeb6c
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_DSP_COMMON_H_
+#define AOM_DSP_AOM_DSP_COMMON_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MAX_SB_SIZE
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define MAX_SB_SIZE 128
+#else
+#define MAX_SB_SIZE 64
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+#endif // ndef MAX_SB_SIZE
+
+#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
+#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
+
+#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
+
+/* Left shifting a negative value became undefined behavior in C99 (downgraded
+ from merely implementation-defined in C89). This should still compile to the
+   correct thing on any two's-complement machine, while avoiding ubsan warnings. */
+#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift)))
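
A quick self-contained illustration of the macro above (the operand values are arbitrary): multiplying by a shifted 1 produces the same result a left shift would on two's-complement hardware, without tripping UBSan on negative inputs.

    #include <assert.h>

    #define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift)))

    int main(void) {
      assert(AOM_SIGNED_SHL(5, 3) == 40);   /* same as 5 << 3 */
      assert(AOM_SIGNED_SHL(-3, 2) == -12); /* "-3 << 2" without the UB */
      return 0;
    }
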
+
+// These can be used to give a hint about branch outcomes.
+// This can have an effect, even if your target processor has a
+// good branch predictor, as these hints can affect basic block
+// ordering by the compiler.
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
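
For illustration, a minimal (hypothetical) use of the hints above, marking a rare error path as cold so the compiler keeps the common case on the fall-through path:

    #include <stdio.h>

    #ifdef __GNUC__
    #define UNLIKELY(v) __builtin_expect(v, 0)
    #else
    #define UNLIKELY(v) (v)
    #endif

    /* Hypothetical input step: the error branch is expected to be rare. */
    static int read_byte_checked(int c) {
      if (UNLIKELY(c < 0)) {
        fprintf(stderr, "unexpected end of input\n");
        return 0;
      }
      return c & 0xff;
    }

    int main(void) { return read_byte_checked(200) == 200 ? 0 : 1; }
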
+
+#define AOM_SWAP(type, a, b) \
+ do { \
+ type c = (b); \
+ b = a; \
+ a = c; \
+ } while (0)
+
+#if CONFIG_AOM_QM
+typedef uint16_t qm_val_t;
+#define AOM_QM_BITS 6
+#endif
+#if CONFIG_HIGHBITDEPTH
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif // CONFIG_HIGHBITDEPTH
+
+static INLINE uint8_t clip_pixel(int val) {
+ return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+ switch (bd) {
+ case 8:
+ default: return (uint16_t)clamp(val, 0, 255);
+ case 10: return (uint16_t)clamp(val, 0, 1023);
+ case 12: return (uint16_t)clamp(val, 0, 4095);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
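
A brief sanity sketch of the clamps above, assuming this header is on the include path with CONFIG_HIGHBITDEPTH enabled (the input values are arbitrary):

    #include <assert.h>
    #include "aom_dsp/aom_dsp_common.h"

    int main(void) {
      assert(clip_pixel(300) == 255);          /* 8-bit clamp */
      assert(clip_pixel_highbd(-17, 10) == 0); /* negative clamps to 0 */
      assert(clip_pixel_highbd(5000, 12) == 4095);
      return 0;
    }
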
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_AOM_DSP_COMMON_H_
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
new file mode 100644
index 000000000..11a57d382
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "./aom_config.h"
+#define RTCD_C
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/aom_once.h"
+
+void aom_dsp_rtcd() { once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
new file mode 100755
index 000000000..b4ef0d92f
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -0,0 +1,1495 @@
+sub aom_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
+EOF
+}
+forward_decls qw/aom_dsp_forward_decls/;
+
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
+if (aom_config("CONFIG_EXT_PARTITION") eq "yes") {
+ @block_widths = (4, 8, 16, 32, 64, 128)
+} else {
+ @block_widths = (4, 8, 16, 32, 64)
+}
+
+@block_sizes = ();
+foreach $w (@block_widths) {
+ foreach $h (@block_widths) {
+ push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;
+ }
+}
+
+@tx_dims = (2, 4, 8, 16, 32);
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+ push @tx_dims, '64';
+}
+
+@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153/;
+if (aom_config("CONFIG_ALT_INTRA") eq "yes") {
+ push @pred_names, qw/paeth smooth/;
+} else {
+ push @pred_names, 'tm';
+}
+
+#
+# Intra prediction
+#
+
+foreach $dim (@tx_dims) {
+ $w = ${dim};
+ $h = ${dim};
+ foreach $pred_name (@pred_names) {
+ add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
+ "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+ "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ }
+ }
+}
+
+specialize qw/aom_d63e_predictor_4x4 ssse3/;
+specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
+specialize qw/aom_d135_predictor_4x4 neon/;
+specialize qw/aom_d153_predictor_4x4 ssse3/;
+specialize qw/aom_v_predictor_4x4 neon msa sse2/;
+if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_tm_predictor_4x4 neon dspr2 msa sse2/;
+} # CONFIG_ALT_INTRA
+specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
+specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
+specialize qw/aom_d153_predictor_8x8 ssse3/;
+specialize qw/aom_v_predictor_8x8 neon msa sse2/;
+if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_tm_predictor_8x8 neon dspr2 msa sse2/;
+} # CONFIG_ALT_INTRA
+specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
+specialize qw/aom_d153_predictor_16x16 ssse3/;
+specialize qw/aom_v_predictor_16x16 neon msa sse2/;
+if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_tm_predictor_16x16 neon msa sse2/;
+} # CONFIG_ALT_INTRA
+specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/aom_h_predictor_32x32 neon msa sse2/;
+specialize qw/aom_d153_predictor_32x32 ssse3/;
+specialize qw/aom_v_predictor_32x32 neon msa sse2/;
+if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_tm_predictor_32x32 neon msa sse2/;
+} # CONFIG_ALT_INTRA
+specialize qw/aom_dc_predictor_32x32 msa neon sse2/;
+specialize qw/aom_dc_top_predictor_32x32 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_32x32 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ specialize qw/aom_highbd_v_predictor_4x4 sse2/;
+ if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_highbd_tm_predictor_4x4 sse2/;
+ } # CONFIG_ALT_INTRA
+ specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_v_predictor_8x8 sse2/;
+ if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_highbd_tm_predictor_8x8 sse2/;
+ } # CONFIG_ALT_INTRA
+  specialize qw/aom_highbd_dc_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_v_predictor_16x16 sse2/;
+ if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_highbd_tm_predictor_16x16 sse2/;
+ } # CONFIG_ALT_INTRA
+ specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
+ specialize qw/aom_highbd_v_predictor_32x32 sse2/;
+ if (aom_config("CONFIG_ALT_INTRA") eq "") {
+ specialize qw/aom_highbd_tm_predictor_32x32 sse2/;
+ } # CONFIG_ALT_INTRA
+ specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
+} # CONFIG_HIGHBITDEPTH
+
+#
+# Sub Pixel Filters
+#
+add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+specialize qw/aom_convolve_copy sse2 /;
+specialize qw/aom_convolve_avg sse2 /;
+specialize qw/aom_convolve8 sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_avg sse2 ssse3/;
+specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
+specialize qw/aom_convolve8_avg_vert sse2 ssse3/;
+specialize qw/aom_scaled_2d ssse3/;
+
+if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
+ add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+ add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+ add_proto qw/void aom_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+ specialize qw/aom_convolve8_add_src ssse3/;
+ specialize qw/aom_convolve8_add_src_horiz ssse3/;
+ specialize qw/aom_convolve8_add_src_vert ssse3/;
+} # CONFIG_LOOP_RESTORATION
+
+# TODO(any): These need to be extended to up to 128x128 block sizes
+if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) {
+ specialize qw/aom_convolve_copy neon dspr2 msa/;
+ specialize qw/aom_convolve_avg neon dspr2 msa/;
+ specialize qw/aom_convolve8 neon dspr2 msa/;
+ specialize qw/aom_convolve8_horiz neon dspr2 msa/;
+ specialize qw/aom_convolve8_vert neon dspr2 msa/;
+ specialize qw/aom_convolve8_avg neon dspr2 msa/;
+ specialize qw/aom_convolve8_avg_horiz neon dspr2 msa/;
+ specialize qw/aom_convolve8_avg_vert neon dspr2 msa/;
+}
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+
+ add_proto qw/void aom_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve_avg sse2 avx2/;
+
+ add_proto qw/void aom_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve8 avx2/, "$sse2_x86_64";
+
+ add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
+
+ add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
+
+ add_proto qw/void aom_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve8_avg avx2/, "$sse2_x86_64";
+
+ add_proto qw/void aom_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve8_avg_horiz avx2/, "$sse2_x86_64";
+
+ add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/aom_highbd_convolve8_avg_vert avx2/, "$sse2_x86_64";
+
+ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
+ add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+ specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64";
+ # The _horiz/_vert functions are currently unused, so we don't bother
+ # specialising them.
+ } # CONFIG_LOOP_RESTORATION
+} # CONFIG_HIGHBITDEPTH
+
+#
+# Loopfilter
+#
+add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
+$aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon;
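+# (Note: assignments of this form tell the rtcd generator which symbol backs
+# a given flavour; the hand-written assembly entry point is exported under
+# the plain _neon name, so the neon_asm specialization above resolves to
+# aom_lpf_vertical_16_neon.)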
+
+add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
+$aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon;
+
+add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/;
+
+add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
+$aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon;
+
+add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/;
+
+add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+
+add_proto qw/void aom_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
+$aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon;
+
+add_proto qw/void aom_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
+$aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon;
+
+add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/;
+
+add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
+$aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon;
+
+add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/;
+
+add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_16 sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_16_dual sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_8 sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_8_dual sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_4 sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_4_dual sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_edge_8 sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_8_dual sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_4_dual sse2/;
+} # CONFIG_HIGHBITDEPTH
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform
+#
+if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq "yes")) {
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 sse2/;
+
+ add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4_1 sse2/;
+
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
+
+ add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8_1 sse2/;
+
+ add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16 sse2/;
+
+ add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16_1 sse2 avx2/;
+
+ add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32 sse2 avx2/;
+
+ add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_rd sse2 avx2/;
+
+ add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_1 sse2 avx2/;
+
+ # High bit depth
+ add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct4x4 sse2/;
+
+ add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8 sse2/;
+
+ add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+
+ add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct16x16 sse2/;
+
+ add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+
+ add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32 sse2/;
+
+ add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32_rd sse2/;
+
+ add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ } else {
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 sse2 msa/;
+
+ add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4_1 sse2/;
+
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+
+ add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8_1 sse2 neon msa/;
+
+ add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16 sse2 msa/;
+
+ add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
+
+ add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32 sse2 avx2 msa/;
+
+ add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
+
+ add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
+ } # CONFIG_HIGHBITDEPTH
+} # CONFIG_AV1_ENCODER || CONFIG_PVQ
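+
+# (Note: with CONFIG_HIGHBITDEPTH enabled, tran_low_t widens from int16_t to
+# int32_t, which is presumably why the neon/msa forward-transform kernels are
+# only listed in the non-high-bit-depth branch above.)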
+
+#
+# Inverse transform
+if (aom_config("CONFIG_AV1") eq "yes") {
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+
+ add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_iwht4x4_16_add sse2/;
+
+ add_proto qw/void aom_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ add_proto qw/void aom_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ add_proto qw/void aom_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ add_proto qw/void aom_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ add_proto qw/void aom_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ add_proto qw/void aom_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ add_proto qw/void aom_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+ {
+ add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct4x4_16_add sse2/;
+
+ add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct4x4_1_add sse2/;
+
+ add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct8x8_64_add sse2 ssse3/;
+
+ add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct8x8_12_add sse2 ssse3/;
+
+ add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct8x8_1_add sse2/;
+
+ add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct16x16_256_add sse2/;
+
+ add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+
+ add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct16x16_10_add sse2/;
+
+ add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct16x16_1_add sse2/;
+
+ add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_1024_add sse2 ssse3/;
+
+ add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_135_add sse2 ssse3/;
+    # TODO: add dedicated 135-eob idct32x32 implementations; reuse the
+    # 1024-eob versions for now.
+ $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
+
+ add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_34_add sse2 ssse3/;
+
+ add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_1_add sse2/;
+
+ add_proto qw/void aom_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/aom_highbd_idct4x4_16_add sse2/;
+ }
+} else {
+ {
+ add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct4x4_1_add sse2 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct4x4_16_add sse2 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct16x16_1_add sse2 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct16x16_256_add sse2 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+
+ add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct16x16_10_add sse2 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/;
+
+ add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_135_add sse2 ssse3 neon dspr2 msa/;
+    # TODO: add dedicated 135-eob idct32x32 implementations; reuse the
+    # 1024-eob versions for now.
+ $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
+ $aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon;
+ $aom_idct32x32_135_add_dspr2=aom_idct32x32_1024_add_dspr2;
+ $aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
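+    # (Illustrative note: these assignments alias the missing 135-eob kernels
+    # to the full 1024-eob implementations, which perform a correct superset
+    # of the work; the generated header simply binds, for example,
+    # aom_idct32x32_135_add_neon to aom_idct32x32_1024_add_neon.)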
+
+ add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_34_add sse2 ssse3 neon dspr2 msa/;
+    # TODO: add a dedicated 34-eob idct32x32 NEON implementation; reuse the
+    # 1024-eob NEON version for now.
+ $aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon;
+
+ add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_idct32x32_1_add sse2 neon dspr2 msa/;
+
+ add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_iwht4x4_1_add msa/;
+
+ add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/aom_iwht4x4_16_add msa sse2/;
+ }
+} # CONFIG_HIGHBITDEPTH
+} # CONFIG_AV1
+
+#
+# Quantization
+#
+if (aom_config("CONFIG_AOM_QM") eq "yes") {
+ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+
+ add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+
+ add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+
+ add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+
+ add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+ } # CONFIG_HIGHBITDEPTH
+ } # CONFIG_AV1_ENCODER
+} else {
+ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+
+ add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+
+ add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b sse2/;
+
+ add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b_32x32 sse2/;
+
+ add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ } # CONFIG_HIGHBITDEPTH
+ } # CONFIG_AV1_ENCODER
+} # CONFIG_AOM_QM
+if (aom_config("CONFIG_AV1") eq "yes") {
+ #
+ # Alpha blending with mask
+ #
+ add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+ add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ specialize "aom_blend_a64_mask", qw/sse4_1/;
+ specialize "aom_blend_a64_hmask", qw/sse4_1/;
+ specialize "aom_blend_a64_vmask", qw/sse4_1/;
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+ add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
+ specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
+ specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
+ }
+} # CONFIG_AV1
+
+if (aom_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Block subtraction
+#
+add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/aom_subtract_block neon msa sse2/;
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+#
+# Sum of Squares
+#
+add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
+specialize qw/aom_sum_squares_2d_i16 sse2/;
+
+add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+specialize qw/aom_sum_squares_i16 sse2/;
+}
+
+
+#
+# Avg
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ #
+ # Avg
+ #
+ add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/aom_avg_8x8 sse2 neon msa/;
+ add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/aom_avg_4x4 sse2 neon msa/;
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
+ add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
+ add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/aom_highbd_subtract_block sse2/;
+ }
+
+ #
+ # Minmax
+ #
+ add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/aom_minmax_8x8 sse2 neon/;
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ }
+
+ add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
+
+ add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/aom_hadamard_16x16 sse2 neon/;
+
+ add_proto qw/int aom_satd/, "const int16_t *coeff, int length";
+ specialize qw/aom_satd sse2 neon/;
+
+ add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height";
+ specialize qw/aom_int_pro_row sse2 neon/;
+
+ add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width";
+ specialize qw/aom_int_pro_col sse2 neon/;
+
+ add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl";
+ specialize qw/aom_vector_var neon sse2/;
+} # CONFIG_AV1_ENCODER
+
+#
+# Single block SAD / Single block Avg SAD
+#
+foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+}
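+
+# (Illustrative expansion: @block_sizes is populated earlier in this file
+# with every supported (width, height) pair, so one iteration of the loop
+# above with ($w, $h) = (64, 32) effectively declares
+#   add_proto qw/unsigned int/, "aom_sad64x32", "const uint8_t *src_ptr, ...";
+#   add_proto qw/unsigned int/, "aom_sad64x32_avg", "..., const uint8_t *second_pred";
+# and the per-size specialize calls below attach the SIMD kernels.)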
+
+specialize qw/aom_sad128x128 avx2 sse2/;
+specialize qw/aom_sad128x64 avx2 sse2/;
+specialize qw/aom_sad64x128 avx2 sse2/;
+specialize qw/aom_sad64x64 avx2 neon msa sse2/;
+specialize qw/aom_sad64x32 avx2 msa sse2/;
+specialize qw/aom_sad32x64 avx2 msa sse2/;
+specialize qw/aom_sad32x32 avx2 neon msa sse2/;
+specialize qw/aom_sad32x16 avx2 msa sse2/;
+specialize qw/aom_sad16x32 msa sse2/;
+specialize qw/aom_sad16x16 media neon msa sse2/;
+specialize qw/aom_sad16x8 neon msa sse2/;
+specialize qw/aom_sad8x16 neon msa sse2/;
+specialize qw/aom_sad8x8 neon msa sse2/;
+specialize qw/aom_sad8x4 msa sse2/;
+specialize qw/aom_sad4x8 msa sse2/;
+specialize qw/aom_sad4x4 neon msa sse2/;
+
+specialize qw/aom_sad128x128_avg avx2 sse2/;
+specialize qw/aom_sad128x64_avg avx2 sse2/;
+specialize qw/aom_sad64x128_avg avx2 sse2/;
+specialize qw/aom_sad64x64_avg avx2 msa sse2/;
+specialize qw/aom_sad64x32_avg avx2 msa sse2/;
+specialize qw/aom_sad32x64_avg avx2 msa sse2/;
+specialize qw/aom_sad32x32_avg avx2 msa sse2/;
+specialize qw/aom_sad32x16_avg avx2 msa sse2/;
+specialize qw/aom_sad16x32_avg msa sse2/;
+specialize qw/aom_sad16x16_avg msa sse2/;
+specialize qw/aom_sad16x8_avg msa sse2/;
+specialize qw/aom_sad8x16_avg msa sse2/;
+specialize qw/aom_sad8x8_avg msa sse2/;
+specialize qw/aom_sad8x4_avg msa sse2/;
+specialize qw/aom_sad4x8_avg msa sse2/;
+specialize qw/aom_sad4x4_avg msa sse2/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ if ($w != 128 && $h != 128 && $w != 4) {
+ specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
+ specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
+ }
+ }
+ specialize qw/aom_highbd_sad128x128 avx2/;
+ specialize qw/aom_highbd_sad128x64 avx2/;
+ specialize qw/aom_highbd_sad64x128 avx2/;
+ specialize qw/aom_highbd_sad64x64 avx2/;
+ specialize qw/aom_highbd_sad64x32 avx2/;
+ specialize qw/aom_highbd_sad32x64 avx2/;
+ specialize qw/aom_highbd_sad32x32 avx2/;
+ specialize qw/aom_highbd_sad32x16 avx2/;
+ specialize qw/aom_highbd_sad16x32 avx2/;
+ specialize qw/aom_highbd_sad16x16 avx2/;
+ specialize qw/aom_highbd_sad16x8 avx2/;
+
+ specialize qw/aom_highbd_sad128x128_avg avx2/;
+ specialize qw/aom_highbd_sad128x64_avg avx2/;
+ specialize qw/aom_highbd_sad64x128_avg avx2/;
+ specialize qw/aom_highbd_sad64x64_avg avx2/;
+ specialize qw/aom_highbd_sad64x32_avg avx2/;
+ specialize qw/aom_highbd_sad32x64_avg avx2/;
+ specialize qw/aom_highbd_sad32x32_avg avx2/;
+ specialize qw/aom_highbd_sad32x16_avg avx2/;
+ specialize qw/aom_highbd_sad16x32_avg avx2/;
+ specialize qw/aom_highbd_sad16x16_avg avx2/;
+ specialize qw/aom_highbd_sad16x8_avg avx2/;
+}
+
+#
+# Masked SAD
+#
+if (aom_config("CONFIG_EXT_INTER") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+ specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
+ }
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+ specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
+ }
+ }
+}
+
+#
+# OBMC SAD
+#
+if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
+ }
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
+ }
+ }
+}
+
+#
+# Multi-block SAD, comparing one source block against N reference blocks 1 pixel apart horizontally
+#
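+# (Illustrative note: the x3 and x8 variants below compute the SAD of one
+# source block against 3 or 8 reference blocks whose start addresses are
+# 1 pixel apart horizontally, writing the results into sad_array so the
+# motion search can evaluate a run of neighbouring positions in one call.)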
+# Blocks of 3
+foreach $s (@block_widths) {
+ add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/aom_sad64x64x3 msa/;
+specialize qw/aom_sad32x32x3 msa/;
+specialize qw/aom_sad16x16x3 sse3 ssse3 msa/;
+specialize qw/aom_sad8x8x3 sse3 msa/;
+specialize qw/aom_sad4x4x3 sse3 msa/;
+
+add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad16x8x3 sse3 ssse3 msa/;
+add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad8x16x3 sse3 msa/;
+
+# Blocks of 8
+foreach $s (@block_widths) {
+ add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/aom_sad64x64x8 msa/;
+specialize qw/aom_sad32x32x8 msa/;
+specialize qw/aom_sad16x16x8 sse4_1 msa/;
+specialize qw/aom_sad8x8x8 sse4_1 msa/;
+specialize qw/aom_sad4x4x8 sse4_1 msa/;
+
+add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad16x8x8 sse4_1 msa/;
+add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad8x16x8 sse4_1 msa/;
+add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad8x4x8 msa/;
+add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad4x8x8 msa/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach $s (@block_widths) {
+ # Blocks of 3
+ add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ # Blocks of 8
+ add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ }
+ # Blocks of 3
+ add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ # Blocks of 8
+ add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+
+#
+# Multi-block SAD, comparing one source block against N independent reference blocks
+#
+foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+}
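+
+# (Illustrative note: the x4d variants take an array of four independent
+# reference pointers and fill sad_array[0..3] in a single call, e.g.
+#   const uint8_t *const refs[4] = { r0, r1, r2, r3 };
+#   aom_sad64x64x4d(src, src_stride, refs, ref_stride, sads);
+# which lets the SIMD kernels reuse the source-block loads across all four
+# references.)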
+
+specialize qw/aom_sad128x128x4d avx2 sse2/;
+specialize qw/aom_sad128x64x4d avx2 sse2/;
+specialize qw/aom_sad64x128x4d avx2 sse2/;
+specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
+specialize qw/aom_sad64x32x4d avx2 msa sse2/;
+specialize qw/aom_sad32x64x4d avx2 msa sse2/;
+specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
+specialize qw/aom_sad32x16x4d msa sse2/;
+specialize qw/aom_sad16x32x4d msa sse2/;
+specialize qw/aom_sad16x16x4d neon msa sse2/;
+specialize qw/aom_sad16x8x4d msa sse2/;
+specialize qw/aom_sad8x16x4d msa sse2/;
+specialize qw/aom_sad8x8x4d msa sse2/;
+specialize qw/aom_sad8x4x4d msa sse2/;
+specialize qw/aom_sad4x8x4d msa sse2/;
+specialize qw/aom_sad4x4x4d msa sse2/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ #
+  # Multi-block SAD, comparing one source block against N independent reference blocks
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ if ($w != 128 && $h != 128) {
+ specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
+ }
+ }
+ specialize qw/aom_highbd_sad128x128x4d avx2/;
+ specialize qw/aom_highbd_sad128x64x4d avx2/;
+ specialize qw/aom_highbd_sad64x128x4d avx2/;
+ specialize qw/aom_highbd_sad64x64x4d avx2/;
+ specialize qw/aom_highbd_sad64x32x4d avx2/;
+ specialize qw/aom_highbd_sad32x64x4d avx2/;
+ specialize qw/aom_highbd_sad32x32x4d avx2/;
+ specialize qw/aom_highbd_sad32x16x4d avx2/;
+ specialize qw/aom_highbd_sad16x32x4d avx2/;
+ specialize qw/aom_highbd_sad16x16x4d avx2/;
+ specialize qw/aom_highbd_sad16x8x4d avx2/;
+}
+
+#
+# Structured Similarity (SSIM)
+#
+if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
+
+ add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ }
+}
+} # CONFIG_ENCODERS
+
+if (aom_config("CONFIG_ENCODERS") eq "yes") {
+
+#
+# Specialty Variance
+#
+add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+specialize qw/aom_get16x16var sse2 avx2 neon msa/;
+specialize qw/aom_get8x8var sse2 neon msa/;
+
+
+add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+specialize qw/aom_mse16x16 sse2 avx2 media neon msa/;
+specialize qw/aom_mse16x8 sse2 msa/;
+specialize qw/aom_mse8x16 sse2 msa/;
+specialize qw/aom_mse8x8 sse2 msa/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach $bd (8, 10, 12) {
+ add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
+ specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
+ }
+}
+
+#
+# ...
+#
+add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/aom_upsampled_pred sse2/;
+add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/aom_comp_avg_upsampled_pred sse2/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_upsampled_pred sse2/;
+ add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+}
+
+#
+# ...
+#
+add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
+add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
+
+specialize qw/aom_get_mb_ss sse2 msa/;
+specialize qw/aom_get4x4sse_cs neon msa/;
+
+#
+# Variance / Subpixel Variance / Subpixel Avg Variance
+#
+ add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+}
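+
+# (Illustrative note: for each size the three prototypes above cover plain
+# variance, bilinear sub-pixel variance (xoffset/yoffset select one of the
+# eight eighth-pel filter positions), and a flavour that averages the
+# reference with second_pred before measuring. Each kernel accumulates the
+# block's sse and sum and returns
+#   variance = sse - (sum * sum) / (w * h)
+# in integer arithmetic.)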
+
+specialize qw/aom_variance64x64 sse2 avx2 neon msa/;
+specialize qw/aom_variance64x32 sse2 avx2 neon msa/;
+specialize qw/aom_variance32x64 sse2 neon msa/;
+specialize qw/aom_variance32x32 sse2 avx2 neon msa/;
+specialize qw/aom_variance32x16 sse2 avx2 msa/;
+specialize qw/aom_variance16x32 sse2 msa/;
+specialize qw/aom_variance16x16 sse2 avx2 media neon msa/;
+specialize qw/aom_variance16x8 sse2 neon msa/;
+specialize qw/aom_variance8x16 sse2 neon msa/;
+specialize qw/aom_variance8x8 sse2 media neon msa/;
+specialize qw/aom_variance8x4 sse2 msa/;
+specialize qw/aom_variance4x8 sse2 msa/;
+specialize qw/aom_variance4x4 sse2 msa/;
+
+specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance16x16 media neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance8x8 media neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/;
+
+specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach $bd (8, 10, 12) {
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+ specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
+ }
+ if ($w == 4 && $h == 4) {
+ specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
+ }
+ if ($w != 128 && $h != 128 && $w != 4) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
+ }
+ if ($w == 4 && $h == 4) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+ }
+ }
+ }
+} # CONFIG_HIGHBITDEPTH
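+
+# (Illustrative note: the size guards above attach the SSE2 kernels only to
+# the block sizes they cover, skipping the 128-wide/-tall and narrowest
+# cases, and route 4x4 to dedicated SSE4.1 kernels instead.)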
+
+if (aom_config("CONFIG_EXT_INTER") eq "yes") {
+#
+# Masked Variance / Masked Subpixel Variance
+#
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+ specialize "aom_masked_variance${w}x${h}", qw/ssse3/;
+ specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ }
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach $bd ("_", "_10_", "_12_") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
+ specialize "aom_highbd${bd}masked_variance${w}x${h}", qw/ssse3/;
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ }
+ }
+ }
+}
+
+#
+# OBMC Variance / OBMC Subpixel Variance
+#
+if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
+ }
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ foreach $bd ("_", "_10_", "_12_") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+ }
+ }
+ }
+}
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+
+add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+
+#
+# Specialty Subpixel
+#
+add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_variance_halfpixvar16x16_h sse2 media/;
+
+add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_variance_halfpixvar16x16_v sse2 media/;
+
+add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_variance_halfpixvar16x16_hv sse2 media/;
+
+#
+# Comp Avg
+#
+add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance64x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance64x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance32x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance32x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance32x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance16x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance16x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance8x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance64x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance64x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance32x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance32x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance32x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance16x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance16x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance8x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance64x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance64x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance32x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance32x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance32x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance16x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance16x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance8x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_mse16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_mse8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_mse16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_mse8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_mse16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_mse8x8 sse2/;
+
+ add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+ #
+ # Subpixel Variance
+ #
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+} # CONFIG_HIGHBITDEPTH
+
+} # CONFIG_ENCODERS
+
+1;
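
For reference, each add_proto/specialize pair above is input to the run-time CPU
detection (RTCD) generator, which roughly emits, per prototype, declarations of
the C and SIMD variants, a function pointer under the bare name, and init code
that points the pointer at the best variant the host CPU reports. A minimal C
sketch of that dispatch, using aom_sub_pixel_avg_variance4x4 as the example;
HAS_SSSE3_FLAG and setup_rtcd_sketch are placeholder names, not the identifiers
in the generated aom_dsp_rtcd.h:

    #include <stdint.h>

    #define HAS_SSSE3_FLAG 0x10 /* placeholder CPU-capability bit (assumption) */

    typedef uint32_t (*subpel_avg_var_fn)(const uint8_t *src, int src_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *ref, int ref_stride,
                                          uint32_t *sse,
                                          const uint8_t *second_pred);

    /* Variants named by add_proto/specialize (definitions live elsewhere). */
    uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *, int, int, int,
                                             const uint8_t *, int, uint32_t *,
                                             const uint8_t *);
    uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *, int, int, int,
                                                 const uint8_t *, int, uint32_t *,
                                                 const uint8_t *);

    /* The bare name the encoder calls resolves to this pointer. */
    subpel_avg_var_fn aom_sub_pixel_avg_variance4x4;

    static void setup_rtcd_sketch(int cpu_flags) {
      aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
      if (cpu_flags & HAS_SSSE3_FLAG)
        aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
    }
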
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
new file mode 100644
index 000000000..04d113dd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_filter.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_FILTER_H_
+#define AOM_DSP_AOM_FILTER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_AOM_FILTER_H_
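
A quick worked example of how these constants fit together (illustrative; it
mirrors the rounding the C sub-pixel variance helpers apply to these taps, and
is not code copied from the import): each row of bilinear_filters_2t sums to
128 == 1 << FILTER_BITS, so a 2-tap interpolation is a weighted sum renormalized
by a rounding shift of FILTER_BITS.

    #include <stdint.h>

    /* Interpolate between two pixels at an eighth-pel offset (subpel 0..7). */
    static uint8_t bilinear_2t_sample(uint8_t left, uint8_t right, int subpel) {
      static const uint8_t filters[8][2] = {
        { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
        { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
      };
      const int sum = left * filters[subpel][0] + right * filters[subpel][1];
      return (uint8_t)((sum + 64) >> 7); /* round-shift by FILTER_BITS */
    }

    /* Example: subpel = 3, left = 100, right = 140:
     * (100*80 + 140*48 + 64) >> 7 = 14784 >> 7 = 115,
     * i.e. 3/8 of the way from 100 to 140. */
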
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
new file mode 100644
index 000000000..469fd8ed2
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_AOM_SIMD_H_
+#define AOM_DSP_AOM_AOM_SIMD_H_
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "./aom_config.h"
+#include "./aom_simd_inline.h"
+
+#define SIMD_CHECK 1 // Sanity checks in C equivalents
+
+#if HAVE_NEON
+#include "simd/v256_intrinsics_arm.h"
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#include "simd/v256_intrinsics_x86.h"
+#else
+#include "simd/v256_intrinsics.h"
+#endif
+
+#endif // AOM_DSP_AOM_AOM_SIMD_H_
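
Usage sketch for the header above (assumptions flagged: the v256_* calls are the
portable wrapper API declared in the simd/v256_intrinsics*.h header selected by
the #if chain, and the include path is abbreviated; check the actual header for
exact names). The point of the #if chain is that the same v256 code compiles
against NEON, x86 SSE2+ where the compiler supports it, or the plain C fallback:

    #include "aom_dsp/aom_simd.h" /* path as seen from the aom root (assumption) */

    /* Add a constant bias to 32 pixels in one 256-bit operation. */
    static void add_bias_32(uint8_t *dst, const uint8_t *src, uint8_t bias) {
      const v256 b = v256_dup_8(bias);             /* broadcast the bias byte */
      const v256 s = v256_load_unaligned(src);     /* 32 bytes of source */
      v256_store_unaligned(dst, v256_add_8(s, b)); /* per-byte add, then store */
    }
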
diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h
new file mode 100644
index 000000000..02a8b3a17
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_DSP_AOM_SIMD_INLINE_H_
+
+#include "aom/aom_integer.h"
+
+#ifndef SIMD_INLINE
+#define SIMD_INLINE static AOM_FORCE_INLINE
+#endif
+
+#endif // AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c
new file mode 100644
index 000000000..09429d6d2
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+ int16x4_t dsrc2, int16x4_t dsrc3,
+ int16x4_t dsrc4, int16x4_t dsrc5,
+ int16x4_t dsrc6, int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w, int h) {
+ int width;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_y;
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 =
+ vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ src += 7;
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz
+ s = src;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(src + 64);
+
+ d0x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+ d1x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(src + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 =
+ vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(src + 64 + src_stride * 2);
+
+ d = dst;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+ d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(src + 64 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ d = dst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ src += src_stride * 4 - w - 7;
+ dst += dst_stride * 4 - w;
+ }
+ return;
+}
+
+void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ uint8x16_t q1u8, q3u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_x;
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+ d -= dst_stride * 3;
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, q0s16);
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+ d26s16, d27s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
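
A scalar model of one output pixel from the kernels above (illustrative only,
not part of the import): MULTIPLY_BY_Q0 accumulates the 8-tap filter sum,
vqrshrun_n_s32(sum, 7) applies a rounding shift by FILTER_BITS with unsigned
saturation, vqmovn_u16 narrows to 8 bits, and vrhaddq_u8 rounding-averages the
filtered value with the pixel already in dst, which is what makes this the
"avg" variant.

    #include <stdint.h>

    static uint8_t convolve8_avg_pixel(const uint8_t *src /* 8 input taps */,
                                       const int16_t *filter /* 8 coefficients */,
                                       uint8_t old_dst) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k) sum += (int32_t)src[k] * filter[k];
      int32_t val = (sum + 64) >> 7;              /* vqrshrun_n_s32(sum, 7) */
      if (val < 0) val = 0;                       /* ...with unsigned saturation */
      if (val > 255) val = 255;                   /* vqmovn_u16 narrowing */
      return (uint8_t)((val + old_dst + 1) >> 1); /* vrhaddq_u8 with dst */
    }
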
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
new file mode 100644
index 000000000..80aef992d
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ ; These functions are only valid when:
+ ; x_step_q4 == 16
+ ; w%4 == 0
+ ; h%4 == 0
+ ; taps == 8
+ ; AV1_FILTER_WEIGHT == 128
+ ; AV1_FILTER_SHIFT == 7
+
+ EXPORT |aom_convolve8_avg_horiz_neon|
+ EXPORT |aom_convolve8_avg_vert_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Multiply and accumulate by q0
+ MACRO
+ MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+ vmull.s16 $dst, $src0, d0[0]
+ vmlal.s16 $dst, $src1, d0[1]
+ vmlal.s16 $dst, $src2, d0[2]
+ vmlal.s16 $dst, $src3, d0[3]
+ vmlal.s16 $dst, $src4, d1[0]
+ vmlal.s16 $dst, $src5, d1[1]
+ vmlal.s16 $dst, $src6, d1[2]
+ vmlal.s16 $dst, $src7, d1[3]
+ MEND
+
+; r0 const uint8_t *src
+; r1 int src_stride
+; r2 uint8_t *dst
+; r3 int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4 ; unused
+; sp[]int w
+; sp[]int h
+
+|aom_convolve8_avg_horiz_neon| PROC
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 ; adjust for taps
+
+ ldr r5, [sp, #32] ; filter_x
+ ldr r6, [sp, #48] ; w
+ ldr r7, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r5] ; filter_x
+
+ sub r8, r1, r1, lsl #2 ; -src_stride * 3
+ add r8, r8, #4 ; -src_stride * 3 + 4
+
+ sub r4, r3, r3, lsl #2 ; -dst_stride * 3
+ add r4, r4, #4 ; -dst_stride * 3 + 4
+
+ rsb r9, r6, r1, lsl #2 ; reset src for outer loop
+ sub r9, r9, #7
+ rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r6 ; w loop counter
+
+aom_convolve8_avg_loop_horiz_v
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d27}, [r0], r8
+
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
+ pld [r0, r1, lsl #2]
+
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+
+ ; save a few instructions in the inner loop
+ vswp d17, d18
+ vmov d23, d21
+
+ add r0, r0, #3
+
+aom_convolve8_avg_loop_horiz
+ add r5, r0, #64
+
+ vld1.32 {d28[]}, [r0], r1
+ vld1.32 {d29[]}, [r0], r1
+ vld1.32 {d31[]}, [r0], r1
+ vld1.32 {d30[]}, [r0], r8
+
+ pld [r5]
+
+ vtrn.16 d28, d31
+ vtrn.16 d29, d30
+ vtrn.8 d28, d29
+ vtrn.8 d31, d30
+
+ pld [r5, r1]
+
+ ; extract to s16
+ vtrn.32 q14, q15
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+
+ pld [r5, r1, lsl #1]
+
+ ; slightly out of order load to match the existing data
+ vld1.u32 {d6[0]}, [r2], r3
+ vld1.u32 {d7[0]}, [r2], r3
+ vld1.u32 {d6[1]}, [r2], r3
+ vld1.u32 {d7[1]}, [r2], r3
+
+ sub r2, r2, r3, lsl #2 ; reset for store
+
+ ; src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
+ MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
+ MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
+ MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+ pld [r5, -r8]
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ ; transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ ; average the new value and the dst value
+ vrhadd.u8 q1, q1, q3
+
+ vst1.u32 {d2[0]}, [r2@32], r3
+ vst1.u32 {d3[0]}, [r2@32], r3
+ vst1.u32 {d2[1]}, [r2@32], r3
+ vst1.u32 {d3[1]}, [r2@32], r4
+
+ vmov q8, q9
+ vmov d20, d23
+ vmov q11, q12
+ vmov q9, q13
+
+ subs r6, r6, #4 ; w -= 4
+ bgt aom_convolve8_avg_loop_horiz
+
+ ; outer loop
+ mov r6, r10 ; restore w counter
+ add r0, r0, r9 ; src += src_stride * 4 - w
+ add r2, r2, r12 ; dst += dst_stride * 4 - w
+ subs r7, r7, #4 ; h -= 4
+ bgt aom_convolve8_avg_loop_horiz_v
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+|aom_convolve8_avg_vert_neon| PROC
+ push {r4-r8, lr}
+
+ ; adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r4, [sp, #32] ; filter_y
+ ldr r6, [sp, #40] ; w
+ ldr lr, [sp, #44] ; h
+
+ vld1.s16 {q0}, [r4] ; filter_y
+
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+
+aom_convolve8_avg_loop_vert_h
+ mov r4, r0
+ add r7, r0, r1, asr #1
+ mov r5, r2
+ add r8, r2, r3, asr #1
+ mov r12, lr ; h loop counter
+
+ vld1.u32 {d16[0]}, [r4], r1
+ vld1.u32 {d16[1]}, [r7], r1
+ vld1.u32 {d18[0]}, [r4], r1
+ vld1.u32 {d18[1]}, [r7], r1
+ vld1.u32 {d20[0]}, [r4], r1
+ vld1.u32 {d20[1]}, [r7], r1
+ vld1.u32 {d22[0]}, [r4], r1
+
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+aom_convolve8_avg_loop_vert
+ ; always process a 4x4 block at a time
+ vld1.u32 {d24[0]}, [r7], r1
+ vld1.u32 {d26[0]}, [r4], r1
+ vld1.u32 {d26[1]}, [r7], r1
+ vld1.u32 {d24[1]}, [r4], r1
+
+ ; extract to s16
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ vld1.u32 {d6[0]}, [r5@32], r3
+ vld1.u32 {d6[1]}, [r8@32], r3
+ vld1.u32 {d7[0]}, [r5@32], r3
+ vld1.u32 {d7[1]}, [r8@32], r3
+
+ pld [r7]
+ pld [r4]
+
+ ; src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
+
+ pld [r7, r1]
+ pld [r4, r1]
+
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
+
+ pld [r5]
+ pld [r8]
+
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+ pld [r5, r3]
+ pld [r8, r3]
+
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ ; average the new value and the dst value
+ vrhadd.u8 q1, q1, q3
+
+ sub r5, r5, r3, lsl #1 ; reset for store
+ sub r8, r8, r3, lsl #1
+
+ vst1.u32 {d2[0]}, [r5@32], r3
+ vst1.u32 {d2[1]}, [r8@32], r3
+ vst1.u32 {d3[0]}, [r5@32], r3
+ vst1.u32 {d3[1]}, [r8@32], r3
+
+ vmov q8, q10
+ vmov d18, d22
+ vmov d19, d24
+ vmov q10, q13
+ vmov d22, d25
+
+ subs r12, r12, #4 ; h -= 4
+ bgt aom_convolve8_avg_loop_vert
+
+ ; outer loop
+ add r0, r0, #4
+ add r2, r2, #4
+ subs r6, r6, #4 ; w -= 4
+ bgt aom_convolve8_avg_loop_vert_h
+
+ pop {r4-r8, pc}
+
+ ENDP
+ END
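
The preconditions listed at the top of this file (x_step_q4 == 16, w and h
multiples of 4, 8 taps, filter weight 128, shift 7) are what callers must
guarantee before reaching these kernels; the C intrinsics version earlier in
this patch asserts the step directly. A hypothetical guard, purely illustrative
and not code from this import:

    /* Route to the NEON kernels only for the unscaled, 4-aligned case;
     * anything else should use the generic C convolution. */
    static int can_use_convolve8_neon(int x_step_q4, int y_step_q4,
                                      int w, int h) {
      return x_step_q4 == 16 && y_step_q4 == 16 &&
             (w & 3) == 0 && (h & 3) == 0;
    }
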
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
new file mode 100644
index 000000000..8ebffb5f9
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+ int16x4_t dsrc2, int16x4_t dsrc3,
+ int16x4_t dsrc4, int16x4_t dsrc5,
+ int16x4_t dsrc6, int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w, int h) {
+ int width;
+ const uint8_t *s, *psrc;
+ uint8_t *d, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_y;
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4, src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 =
+ vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst; width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+ d1x2u16 =
+ vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 =
+ vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+ d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
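+// Vertical 8-tap convolution. Requires y_step_q4 == 16 (asserted below);
+// w and h are expected to be multiples of 4. Columns are processed four at a
+// time, producing four output rows per inner iteration.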
+void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint32x2_t d2u32, d3u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_x;
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, q0s16);
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+ d26s16, d27s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+ d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+ d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm
new file mode 100644
index 000000000..38207d864
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm
@@ -0,0 +1,273 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ ; These functions are only valid when:
+ ; x_step_q4 == 16
+ ; w%4 == 0
+ ; h%4 == 0
+ ; taps == 8
+ ; AV1_FILTER_WEIGHT == 128
+ ; AV1_FILTER_SHIFT == 7
+
+ EXPORT |aom_convolve8_horiz_neon|
+ EXPORT |aom_convolve8_vert_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Multiply and accumulate by q0
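+    ; (i.e. $dst = $src0*filter[0] + ... + $src7*filter[7], with the eight
+    ; filter taps held in d0/d1)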
+ MACRO
+ MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+ vmull.s16 $dst, $src0, d0[0]
+ vmlal.s16 $dst, $src1, d0[1]
+ vmlal.s16 $dst, $src2, d0[2]
+ vmlal.s16 $dst, $src3, d0[3]
+ vmlal.s16 $dst, $src4, d1[0]
+ vmlal.s16 $dst, $src5, d1[1]
+ vmlal.s16 $dst, $src6, d1[2]
+ vmlal.s16 $dst, $src7, d1[3]
+ MEND
+
+; r0 const uint8_t *src
+; r1 int src_stride
+; r2 uint8_t *dst
+; r3 int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4 ; unused
+; sp[]int w
+; sp[]int h
+
+|aom_convolve8_horiz_neon| PROC
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 ; adjust for taps
+
+ ldr r5, [sp, #32] ; filter_x
+ ldr r6, [sp, #48] ; w
+ ldr r7, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r5] ; filter_x
+
+ sub r8, r1, r1, lsl #2 ; -src_stride * 3
+ add r8, r8, #4 ; -src_stride * 3 + 4
+
+ sub r4, r3, r3, lsl #2 ; -dst_stride * 3
+ add r4, r4, #4 ; -dst_stride * 3 + 4
+
+ rsb r9, r6, r1, lsl #2 ; reset src for outer loop
+ sub r9, r9, #7
+ rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r6 ; w loop counter
+
+aom_convolve8_loop_horiz_v
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d27}, [r0], r8
+
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
+ pld [r0, r1, lsl #2]
+
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+
+ ; save a few instructions in the inner loop
+ vswp d17, d18
+ vmov d23, d21
+
+ add r0, r0, #3
+
+aom_convolve8_loop_horiz
+ add r5, r0, #64
+
+ vld1.32 {d28[]}, [r0], r1
+ vld1.32 {d29[]}, [r0], r1
+ vld1.32 {d31[]}, [r0], r1
+ vld1.32 {d30[]}, [r0], r8
+
+ pld [r5]
+
+ vtrn.16 d28, d31
+ vtrn.16 d29, d30
+ vtrn.8 d28, d29
+ vtrn.8 d31, d30
+
+ pld [r5, r1]
+
+ ; extract to s16
+ vtrn.32 q14, q15
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+
+ pld [r5, r1, lsl #1]
+
+ ; src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
+ MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
+ MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
+ MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+ pld [r5, -r8]
+
+ ; += 64 >> 7
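+    ; (vqrshrun: rounding shift right by 7, narrowing with unsigned saturation)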
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ ; transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ vst1.u32 {d2[0]}, [r2@32], r3
+ vst1.u32 {d3[0]}, [r2@32], r3
+ vst1.u32 {d2[1]}, [r2@32], r3
+ vst1.u32 {d3[1]}, [r2@32], r4
+
+ vmov q8, q9
+ vmov d20, d23
+ vmov q11, q12
+ vmov q9, q13
+
+ subs r6, r6, #4 ; w -= 4
+ bgt aom_convolve8_loop_horiz
+
+ ; outer loop
+ mov r6, r10 ; restore w counter
+ add r0, r0, r9 ; src += src_stride * 4 - w
+ add r2, r2, r12 ; dst += dst_stride * 4 - w
+ subs r7, r7, #4 ; h -= 4
+ bgt aom_convolve8_loop_horiz_v
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+|aom_convolve8_vert_neon| PROC
+ push {r4-r8, lr}
+
+ ; adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r4, [sp, #32] ; filter_y
+ ldr r6, [sp, #40] ; w
+ ldr lr, [sp, #44] ; h
+
+ vld1.s16 {q0}, [r4] ; filter_y
+
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+
+aom_convolve8_loop_vert_h
+ mov r4, r0
+ add r7, r0, r1, asr #1
+ mov r5, r2
+ add r8, r2, r3, asr #1
+ mov r12, lr ; h loop counter
+
+ vld1.u32 {d16[0]}, [r4], r1
+ vld1.u32 {d16[1]}, [r7], r1
+ vld1.u32 {d18[0]}, [r4], r1
+ vld1.u32 {d18[1]}, [r7], r1
+ vld1.u32 {d20[0]}, [r4], r1
+ vld1.u32 {d20[1]}, [r7], r1
+ vld1.u32 {d22[0]}, [r4], r1
+
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+aom_convolve8_loop_vert
+ ; always process a 4x4 block at a time
+ vld1.u32 {d24[0]}, [r7], r1
+ vld1.u32 {d26[0]}, [r4], r1
+ vld1.u32 {d26[1]}, [r7], r1
+ vld1.u32 {d24[1]}, [r4], r1
+
+ ; extract to s16
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ pld [r5]
+ pld [r8]
+
+ ; src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
+
+ pld [r5, r3]
+ pld [r8, r3]
+
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
+
+ pld [r7]
+ pld [r4]
+
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+ pld [r7, r1]
+ pld [r4, r1]
+
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ vst1.u32 {d2[0]}, [r5@32], r3
+ vst1.u32 {d2[1]}, [r8@32], r3
+ vst1.u32 {d3[0]}, [r5@32], r3
+ vst1.u32 {d3[1]}, [r8@32], r3
+
+ vmov q8, q10
+ vmov d18, d22
+ vmov d19, d24
+ vmov q10, q13
+ vmov d22, d25
+
+ subs r12, r12, #4 ; h -= 4
+ bgt aom_convolve8_loop_vert
+
+ ; outer loop
+ add r0, r0, #4
+ add r2, r2, #4
+ subs r6, r6, #4 ; w -= 4
+ bgt aom_convolve8_loop_vert_h
+
+ pop {r4-r8, pc}
+
+ ENDP
+ END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c
new file mode 100644
index 000000000..f05d3ceae
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+void aom_convolve_avg_neon(const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride, int w,
+ int h) {
+ uint8_t *d;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint32x2_t d0u32, d2u32;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+
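+  // vrhadd/vrhaddq compute a per-byte rounding average: (a + b + 1) >> 1.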
+ d = dst;
+ if (w > 32) { // avg64
+ for (; h > 0; h -= 1) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ q10u8 = vld1q_u8(d + 32);
+ q11u8 = vld1q_u8(d + 48);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // avg32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+ q10u8 = vld1q_u8(d);
+ q11u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // avg16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(d);
+ d += dst_stride;
+ q3u8 = vld1q_u8(d);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // avg8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d1u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(d);
+ d += dst_stride;
+ d3u8 = vld1_u8(d);
+ d += dst_stride;
+
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+ vst1_u8(dst, vget_low_u8(q0u8));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(q0u8));
+ dst += dst_stride;
+ }
+ } else { // avg4
+ for (; h > 0; h -= 2) {
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+ src += src_stride;
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+ src += src_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
+
+ d0u32 = vreinterpret_u32_u8(d0u8);
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+ dst += dst_stride;
+ }
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
new file mode 100644
index 000000000..43c300954
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
@@ -0,0 +1,119 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_convolve_avg_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|aom_convolve_avg_neon| PROC
+ push {r4-r6, lr}
+ ldrd r4, r5, [sp, #32]
+ mov r6, r2
+
+ cmp r4, #32
+ bgt avg64
+ beq avg32
+ cmp r4, #8
+ bgt avg16
+ beq avg8
+ b avg4
+
+avg64
+ sub lr, r1, #32
+ sub r4, r3, #32
+avg64_h
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0]!
+ vld1.8 {q2-q3}, [r0], lr
+ pld [r2, r3]
+ vld1.8 {q8-q9}, [r6@128]!
+ vld1.8 {q10-q11}, [r6@128], r4
+ vrhadd.u8 q0, q0, q8
+ vrhadd.u8 q1, q1, q9
+ vrhadd.u8 q2, q2, q10
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2@128]!
+ vst1.8 {q2-q3}, [r2@128], r4
+ subs r5, r5, #1
+ bgt avg64_h
+ pop {r4-r6, pc}
+
+avg32
+ vld1.8 {q0-q1}, [r0], r1
+ vld1.8 {q2-q3}, [r0], r1
+ vld1.8 {q8-q9}, [r6@128], r3
+ vld1.8 {q10-q11}, [r6@128], r3
+ pld [r0]
+ vrhadd.u8 q0, q0, q8
+ pld [r0, r1]
+ vrhadd.u8 q1, q1, q9
+ pld [r6]
+ vrhadd.u8 q2, q2, q10
+ pld [r6, r3]
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2@128], r3
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #2
+ bgt avg32
+ pop {r4-r6, pc}
+
+avg16
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q2}, [r6@128], r3
+ vld1.8 {q3}, [r6@128], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q2
+ pld [r6]
+ pld [r6, r3]
+ vrhadd.u8 q1, q1, q3
+ vst1.8 {q0}, [r2@128], r3
+ vst1.8 {q1}, [r2@128], r3
+ subs r5, r5, #2
+ bgt avg16
+ pop {r4-r6, pc}
+
+avg8
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d2}, [r6@64], r3
+ vld1.8 {d3}, [r6@64], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q1
+ pld [r6]
+ pld [r6, r3]
+ vst1.8 {d0}, [r2@64], r3
+ vst1.8 {d1}, [r2@64], r3
+ subs r5, r5, #2
+ bgt avg8
+ pop {r4-r6, pc}
+
+avg4
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[0]}, [r6@32], r3
+ vld1.32 {d2[1]}, [r6@32], r3
+ vrhadd.u8 d0, d0, d2
+ vst1.32 {d0[0]}, [r2@32], r3
+ vst1.32 {d0[1]}, [r2@32], r3
+ subs r5, r5, #2
+ bgt avg4
+ pop {r4-r6, pc}
+ ENDP
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
new file mode 100644
index 000000000..9e57c7176
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+void aom_convolve_copy_neon(const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride, int w,
+ int h) {
+ uint8x8_t d0u8, d2u8;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+
+ if (w > 32) { // copy64
+ for (; h > 0; h--) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // copy32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // copy16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // copy8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, d0u8);
+ dst += dst_stride;
+ vst1_u8(dst, d2u8);
+ dst += dst_stride;
+ }
+ } else { // copy4
+ for (; h > 0; h--) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
new file mode 100644
index 000000000..443d7178a
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
@@ -0,0 +1,87 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_convolve_copy_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|aom_convolve_copy_neon| PROC
+ push {r4-r5, lr}
+ ldrd r4, r5, [sp, #28]
+
+ cmp r4, #32
+ bgt copy64
+ beq copy32
+ cmp r4, #8
+ bgt copy16
+ beq copy8
+ b copy4
+
+copy64
+ sub lr, r1, #32
+ sub r3, r3, #32
+copy64_h
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0]!
+ vld1.8 {q2-q3}, [r0], lr
+ vst1.8 {q0-q1}, [r2@128]!
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #1
+ bgt copy64_h
+ pop {r4-r5, pc}
+
+copy32
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {q2-q3}, [r0], r1
+ vst1.8 {q0-q1}, [r2@128], r3
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #2
+ bgt copy32
+ pop {r4-r5, pc}
+
+copy16
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {q1}, [r0], r1
+ vst1.8 {q0}, [r2@128], r3
+ vst1.8 {q1}, [r2@128], r3
+ subs r5, r5, #2
+ bgt copy16
+ pop {r4-r5, pc}
+
+copy8
+ pld [r0, r1, lsl #1]
+ vld1.8 {d0}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {d2}, [r0], r1
+ vst1.8 {d0}, [r2@64], r3
+ vst1.8 {d2}, [r2@64], r3
+ subs r5, r5, #2
+ bgt copy8
+ pop {r4-r5, pc}
+
+copy4
+ ldr r12, [r0], r1
+ str r12, [r2], r3
+ subs r5, r5, #1
+ bgt copy4
+ pop {r4-r5, pc}
+ ENDP
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c
new file mode 100644
index 000000000..6c2997e04
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+  /* Given our constraints (w <= 64, h <= 64, taps == 8), the intermediate
+   * buffer needs at most 64 columns by 64 + 7 rows, rounded up to 72 rows so
+   * that the height stays divisible by 4.
+   */
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
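+  /* Worked out: 64 * (64 + 7 + 1) = 64 * 72 = 4608 bytes. */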
+
+ // Account for the vertical phase needing 3 lines prior and 4 lines post
+ int intermediate_height = h + 7;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. The neon implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes into the
+   * temp buffer, which has lots of extra room and is subsequently discarded,
+   * this is safe if somewhat less than ideal.
+   */
+ aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+ x_step_q4, filter_y, y_step_q4, w,
+ intermediate_height);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+ int intermediate_height = h + 7;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+ x_step_q4, filter_y, y_step_q4, w,
+ intermediate_height);
+ aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c
new file mode 100644
index 000000000..e730ccbcc
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+
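+// Sum all eight 16-bit lanes: widen pairwise to 32 bits, then to 64 bits, and
+// add the two halves.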
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+ const uint32x4_t a = vpaddlq_u16(v_16x8);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
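+// Average of a 4x4 block: sum the 16 pixels and round, (sum + 8) >> 4.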
+unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) {
+ uint16x8_t v_sum;
+ uint32x2_t v_s0 = vdup_n_u32(0);
+ uint32x2_t v_s1 = vdup_n_u32(0);
+ v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+ v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+ v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+ v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+ v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+ return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
+unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) {
+ uint8x8_t v_s0 = vld1_u8(s);
+ const uint8x8_t v_s1 = vld1_u8(s + p);
+ uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+ v_s0 = vld1_u8(s + 2 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 3 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 4 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 5 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 6 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 7 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int aom_satd_neon(const int16_t *coeff, int length) {
+ const int16x4_t zero = vdup_n_s16(0);
+ int32x4_t accum = vdupq_n_s32(0);
+
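+  // Accumulating the absolute difference against a zero vector sums
+  // |coeff[i]| into the 32-bit lanes, 16 coefficients per iteration.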
+ do {
+ const int16x8_t src0 = vld1q_s16(coeff);
+ const int16x8_t src8 = vld1q_s16(coeff + 8);
+ accum = vabal_s16(accum, vget_low_s16(src0), zero);
+ accum = vabal_s16(accum, vget_high_s16(src0), zero);
+ accum = vabal_s16(accum, vget_low_s16(src8), zero);
+ accum = vabal_s16(accum, vget_high_s16(src8), zero);
+ length -= 16;
+ coeff += 16;
+ } while (length != 0);
+
+ {
+ // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+ const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'.
+ const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+ vreinterpret_s32_s64(vget_high_s64(s0)));
+ const int satd = vget_lane_s32(s1, 0);
+ return satd;
+ }
+}
+
+void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, int ref_stride,
+ int height) {
+ int i;
+ uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_hi = vdupq_n_u16(0);
+ const int shift_factor = ((height >> 5) + 3) * -1;
+ const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
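+  // For height 16/32/64 the shift factor is -3/-4/-5; vshlq with a negative
+  // count shifts right, scaling the column sums down accordingly.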
+
+ for (i = 0; i < height; i += 8) {
+ const uint8x16_t vec_row1 = vld1q_u8(ref);
+ const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+ const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+ const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+ const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+ const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+ const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+ const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+ ref += ref_stride * 8;
+ }
+
+ vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+ vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+ vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+ hbuf += 8;
+ vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}
+
+int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) {
+ int i;
+ uint16x8_t vec_sum = vdupq_n_u16(0);
+
+ for (i = 0; i < width; i += 16) {
+ const uint8x16_t vec_row = vld1q_u8(ref);
+ vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+ vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+ ref += 16;
+ }
+
+ return horizontal_add_u16x8(vec_sum);
+}
+
+// ref, src = [0, 510]; their difference fits in 16 bits.
+// bwl = {2, 3, 4}, width = {16, 32, 64}
+int aom_vector_var_neon(int16_t const *ref, int16_t const *src, int bwl) {
+ int width = 4 << bwl;
+ int32x4_t sse = vdupq_n_s32(0);
+ int16x8_t total = vdupq_n_s16(0);
+
+ assert(width >= 8);
+ assert((width % 8) == 0);
+
+ do {
+ const int16x8_t r = vld1q_s16(ref);
+ const int16x8_t s = vld1q_s16(src);
+ const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits.
+ const int16x4_t diff_lo = vget_low_s16(diff);
+ const int16x4_t diff_hi = vget_high_s16(diff);
+ sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits.
+ sse = vmlal_s16(sse, diff_hi, diff_hi);
+ total = vaddq_s16(total, diff); // dynamic range 16 bits.
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+ } while (width != 0);
+
+ {
+    // Note: the pairwise addition of 'total' could be implemented similarly
+    // to horizontal_add_u16x8(), but using one less vpaddl on 'total' and
+    // pairing it with the summation of 'sse' performed better on a Cortex-A15.
+ const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total'
+ const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+ const int32x2_t t2 = vpadd_s32(t1, t1);
+ const int t = vget_lane_s32(t2, 0);
+ const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'.
+ const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+ vreinterpret_s32_s64(vget_high_s64(s0)));
+ const int s = vget_lane_s32(s1, 0);
+ const int shift_factor = bwl + 2;
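+    // width == 1 << (bwl + 2), so this returns sum(d^2) - (sum(d))^2 / width.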
+ return s - ((t * t) >> shift_factor);
+ }
+}
+
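+// Find the smallest and largest absolute difference between corresponding
+// pixels of two 8x8 blocks.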
+void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max) {
+ // Load and concatenate.
+ const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+ const uint8x16_t a23 =
+ vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+ const uint8x16_t a45 =
+ vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+ const uint8x16_t a67 =
+ vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
+
+ const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+ const uint8x16_t b23 =
+ vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+ const uint8x16_t b45 =
+ vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+ const uint8x16_t b67 =
+ vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
+
+ // Absolute difference.
+ const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+ const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+ const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+ const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+ // Max values between the Q vectors.
+ const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+ const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+ const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+ const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+  // Split into D registers and start doing pairwise max/min.
+ uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+ uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Enough runs of vpmax/min propagate the max/min values to every position.
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u8((uint8_t *)max, ab_max, 0);
+ vst1_lane_u8((uint8_t *)min, ab_min, 0);
+}
diff --git a/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm b/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm
new file mode 100644
index 000000000..17b7d25f9
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm
@@ -0,0 +1,240 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ EXPORT |aom_filter_block2d_bil_first_pass_media|
+ EXPORT |aom_filter_block2d_bil_second_pass_media|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *dst_ptr,
+; r2 unsigned int src_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *aom_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make it easy for second pass filtering.
+|aom_filter_block2d_bil_first_pass_media| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; aom_filter address
+ ldr r4, [sp, #36] ; width
+
+ mov r12, r3 ; outer-loop counter
+
+ add r7, r2, r4 ; preload next row
+ pld [r0, r7]
+
+ sub r2, r2, r4 ; src increment for height loop
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; height*2
+ add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
+
+ mov r11, r1 ; save dst_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+    mov     lr, r4, lsr #2          ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ add r9, r2, r4, lsl #1 ; adding back block width
+ pld [r0, r9] ; preload next row
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+    strh    r6, [r1], r3            ; store it to intermediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |aom_filter_block2d_bil_first_pass_media|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *dst_ptr,
+; r2 int dst_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *aom_filter
+;---------------------------------
+|aom_filter_block2d_bil_second_pass_media| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; aom_filter address
+ ldr r4, [sp, #36] ; width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+    ENDP  ; |aom_filter_block2d_bil_second_pass_media|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 000000000..1cf8a3a6e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
+void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+ int i;
+ // stage 1
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ for (i = 0; i < 2; ++i) {
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from aom_dct_sse2.c
+ // Post-condition (division by two)
+    // division of a 16-bit signed number by two using shifts:
+ // n / 2 = (n - (n >> 15)) >> 1
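+    // e.g. n = -3: (-3 - (-1)) >> 1 = -2 >> 1 = -1, matching C's truncation
+    // toward zero, whereas a plain arithmetic shift right would give -2.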
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
+
+void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+ {
+ const int32x4_t a = vpaddlq_s16(sum);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+ output[1] = 0;
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c
new file mode 100644
index 000000000..9baefae47
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+
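+// One pass of an 8-point Hadamard butterfly, applied column-wise so that
+// eight transforms are computed in parallel across the eight input vectors.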
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ const int16x8_t b0 = vaddq_s16(*a0, *a1);
+ const int16x8_t b1 = vsubq_s16(*a0, *a1);
+ const int16x8_t b2 = vaddq_s16(*a2, *a3);
+ const int16x8_t b3 = vsubq_s16(*a2, *a3);
+ const int16x8_t b4 = vaddq_s16(*a4, *a5);
+ const int16x8_t b5 = vsubq_s16(*a4, *a5);
+ const int16x8_t b6 = vaddq_s16(*a6, *a7);
+ const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+ const int16x8_t c4 = vaddq_s16(b4, b6);
+ const int16x8_t c5 = vaddq_s16(b5, b7);
+ const int16x8_t c6 = vsubq_s16(b4, b6);
+ const int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a1 = vsubq_s16(c2, c6);
+ *a2 = vsubq_s16(c0, c4);
+ *a3 = vaddq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+ *a6 = vsubq_s16(c1, c5);
+ *a7 = vaddq_s16(c1, c5);
+}
+
+// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
+// reversing transpose order which may make it easier for the compiler to
+// reconcile the vtrn.64 moves.
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ // Swap 64 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 08 09 10 11 12 13 14 15
+ // a2: 16 17 18 19 20 21 22 23
+ // a3: 24 25 26 27 28 29 30 31
+ // a4: 32 33 34 35 36 37 38 39
+ // a5: 40 41 42 43 44 45 46 47
+ // a6: 48 49 50 51 52 53 54 55
+ // a7: 56 57 58 59 60 61 62 63
+ // to:
+ // a04_lo: 00 01 02 03 32 33 34 35
+ // a15_lo: 08 09 10 11 40 41 42 43
+ // a26_lo: 16 17 18 19 48 49 50 51
+ // a37_lo: 24 25 26 27 56 57 58 59
+ // a04_hi: 04 05 06 07 36 37 38 39
+ // a15_hi: 12 13 14 15 44 45 46 47
+ // a26_hi: 20 21 22 23 52 53 54 55
+ // a37_hi: 28 29 30 31 60 61 62 63
+ const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
+ const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
+ const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
+ const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
+ const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
+ const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
+ const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
+ const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
+
+ // Swap 32 bit elements resulting in:
+ // a0246_lo:
+ // 00 01 16 17 32 33 48 49
+ // 02 03 18 19 34 35 50 51
+ // a1357_lo:
+ // 08 09 24 25 40 41 56 57
+ // 10 11 26 27 42 43 58 59
+ // a0246_hi:
+ // 04 05 20 21 36 37 52 53
+ // 06 07 22 23 38 39 54 55
+ // a1657_hi:
+ // 12 13 28 29 44 45 60 61
+ // 14 15 30 31 46 47 62 63
+ const int32x4x2_t a0246_lo =
+ vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
+ const int32x4x2_t a1357_lo =
+ vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
+ const int32x4x2_t a0246_hi =
+ vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
+ const int32x4x2_t a1357_hi =
+ vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
+
+ // Swap 16 bit elements resulting in:
+ // b0:
+ // 00 08 16 24 32 40 48 56
+ // 01 09 17 25 33 41 49 57
+ // b1:
+ // 02 10 18 26 34 42 50 58
+ // 03 11 19 27 35 43 51 59
+ // b2:
+ // 04 12 20 28 36 44 52 60
+ // 05 13 21 29 37 45 53 61
+ // b3:
+ // 06 14 22 30 38 46 54 62
+ // 07 15 23 31 39 47 55 63
+ const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
+ vreinterpretq_s16_s32(a1357_lo.val[0]));
+ const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
+ vreinterpretq_s16_s32(a1357_lo.val[1]));
+ const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
+ vreinterpretq_s16_s32(a1357_hi.val[0]));
+ const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
+ vreinterpretq_s16_s32(a1357_hi.val[1]));
+
+ *a0 = b0.val[0];
+ *a1 = b0.val[1];
+ *a2 = b1.val[0];
+ *a3 = b1.val[1];
+ *a4 = b2.val[0];
+ *a5 = b2.val[1];
+ *a6 = b3.val[0];
+ *a7 = b3.val[1];
+}
+
+void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int16x8_t a0 = vld1q_s16(src_diff);
+ int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+ int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Skip the second transpose because it is not required.
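+ // It would only reorder the output coefficients, and the consumers of the
+ // Hadamard output do not appear to depend on a particular coefficient
+ // order.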
+
+ vst1q_s16(coeff + 0, a0);
+ vst1q_s16(coeff + 8, a1);
+ vst1q_s16(coeff + 16, a2);
+ vst1q_s16(coeff + 24, a3);
+ vst1q_s16(coeff + 32, a4);
+ vst1q_s16(coeff + 40, a5);
+ vst1q_s16(coeff + 48, a6);
+ vst1q_s16(coeff + 56, a7);
+}
+
+void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int i;
+
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+ /* Bottom left. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+ /* Bottom right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
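+ /* Each 8x8 Hadamard above wrote 64 contiguous coefficients, so the four
+ * quadrants sit at coeff + 0, + 64, + 128 and + 192. The loop below
+ * combines them eight columns at a time. */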
+ for (i = 0; i < 64; i += 8) {
+ const int16x8_t a0 = vld1q_s16(coeff + 0);
+ const int16x8_t a1 = vld1q_s16(coeff + 64);
+ const int16x8_t a2 = vld1q_s16(coeff + 128);
+ const int16x8_t a3 = vld1q_s16(coeff + 192);
+
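+ // vhaddq_s16/vhsubq_s16 halve their results, folding the >> 1
+ // normalization of the 16x16 Hadamard into the add/subtract and keeping
+ // the intermediate values within int16_t range.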
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+
+ vst1q_s16(coeff + 0, c0);
+ vst1q_s16(coeff + 64, c1);
+ vst1q_s16(coeff + 128, c2);
+ vst1q_s16(coeff + 192, c3);
+
+ coeff += 8;
+ }
+}
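+
+/* Illustrative scalar sketch (not part of the imported sources, hence kept
+ * under #if 0) of what the combine loop above computes for one coefficient
+ * position i in each quadrant; the helper name is hypothetical. The >> 1
+ * mirrors vhaddq_s16/vhsubq_s16. */
+#if 0
+static void hadamard_16x16_combine_sketch(int16_t *coeff, int i) {
+ const int16_t a0 = coeff[i + 0];
+ const int16_t a1 = coeff[i + 64];
+ const int16_t a2 = coeff[i + 128];
+ const int16_t a3 = coeff[i + 192];
+
+ const int16_t b0 = (a0 + a1) >> 1;
+ const int16_t b1 = (a0 - a1) >> 1;
+ const int16_t b2 = (a2 + a3) >> 1;
+ const int16_t b3 = (a2 - a3) >> 1;
+
+ coeff[i + 0] = (int16_t)(b0 + b2);
+ coeff[i + 64] = (int16_t)(b1 + b3);
+ coeff[i + 128] = (int16_t)(b0 - b2);
+ coeff[i + 192] = (int16_t)(b1 - b3);
+}
+#endif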
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm
new file mode 100644
index 000000000..d01c4bc03
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm
@@ -0,0 +1,201 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+
+ EXPORT |aom_idct16x16_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|aom_idct16x16_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 6)
+ add r0, r0, #32 ; + (1 <<((6) - 1))
+ asr r0, r0, #6 ; >> 6
+
+ vdup.s16 q0, r0 ; duplicate a1
+ mov r0, #8
+ sub r2, #8
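+ ; r0 = 8 and r2 = dest_stride - 8, so alternating post-increments step
+ ; across the two 8-byte halves of a 16-pixel row and then down to the
+ ; start of the next row.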
+
+ ; load destination data row0 - row3
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row4 - row7
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row8 - row11
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row12 - row15
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ bx lr
+ ENDP ; |aom_idct16x16_1_add_neon|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 000000000..196b2a890
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, j, a1;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
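+ // DC-only path: only input[0] is assumed non-zero, so the inverse
+ // transform reduces to adding the constant a1 to every pixel of the
+ // 16x16 destination block.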
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ for (d1 = d2 = dest, i = 0; i < 4; i++) {
+ for (j = 0; j < 2; j++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm
new file mode 100644
index 000000000..4a8f8f183
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm
@@ -0,0 +1,1182 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_idct16x16_256_add_neon_pass1|
+ EXPORT |aom_idct16x16_256_add_neon_pass2|
+ EXPORT |aom_idct16x16_10_add_neon_pass1|
+ EXPORT |aom_idct16x16_10_add_neon_pass2|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input,
+; int16_t *output, int output_stride)
+;
+; r0 int16_t input
+; r1 int16_t *output
+; r2 int output_stride)
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|aom_idct16x16_256_add_neon_pass1| PROC
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q1,q2}, [r0]!
+ vmov.s16 q15, q1
+
+ ; generate cospi_28_64 = 3196
+ mov r3, #0xc00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r12, #0x3e00
+ add r12, #0xc5
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 d0, r3 ; duplicate cospi_28_64
+ vdup.16 d1, r12 ; duplicate cospi_4_64
+
+ ; preloading to avoid stall
+ ; generate cospi_12_64 = 13623
+ mov r3, #0x3500
+ add r3, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r12, #0x2300
+ add r12, #0x8e
+
+ ; step2[4] * cospi_28_64
+ vmull.s16 q2, d18, d0
+ vmull.s16 q3, d19, d0
+
+ ; step2[4] * cospi_4_64
+ vmull.s16 q5, d18, d1
+ vmull.s16 q6, d19, d1
+
+ ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
+ vmlsl.s16 q2, d30, d1
+ vmlsl.s16 q3, d31, d1
+
+ ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
+ vmlal.s16 q5, d30, d0
+ vmlal.s16 q6, d31, d0
+
+ vdup.16 d2, r3 ; duplicate cospi_12_64
+ vdup.16 d3, r12 ; duplicate cospi_20_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d8, q2, #14 ; >> 14
+ vqrshrn.s32 d9, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d14, q5, #14 ; >> 14
+ vqrshrn.s32 d15, q6, #14 ; >> 14
+
+ ; preloading to avoid stall
+ ; generate cospi_16_64 = 11585
+ mov r3, #0x2d00
+ add r3, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r12, #0x1800
+ add r12, #0x7e
+
+ ; step2[5] * cospi_12_64
+ vmull.s16 q2, d26, d2
+ vmull.s16 q3, d27, d2
+
+ ; step2[5] * cospi_20_64
+ vmull.s16 q9, d26, d3
+ vmull.s16 q15, d27, d3
+
+ ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64
+ vmlsl.s16 q2, d22, d3
+ vmlsl.s16 q3, d23, d3
+
+ ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
+ vmlal.s16 q9, d22, d2
+ vmlal.s16 q15, d23, d2
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d10, q2, #14 ; >> 14
+ vqrshrn.s32 d11, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q15, #14 ; >> 14
+
+ ; stage 4
+ vdup.16 d30, r3 ; cospi_16_64
+
+ ; step1[0] * cospi_16_64
+ vmull.s16 q2, d16, d30
+ vmull.s16 q11, d17, d30
+
+ ; step1[1] * cospi_16_64
+ vmull.s16 q0, d24, d30
+ vmull.s16 q1, d25, d30
+
+ ; generate cospi_8_64 = 15137
+ mov r3, #0x3b00
+ add r3, #0x21
+
+ vdup.16 d30, r12 ; duplicate cospi_24_64
+ vdup.16 d31, r3 ; duplicate cospi_8_64
+
+ ; temp1 = (step1[0] + step1[1]) * cospi_16_64
+ vadd.s32 q3, q2, q0
+ vadd.s32 q12, q11, q1
+
+ ; temp2 = (step1[0] - step1[1]) * cospi_16_64
+ vsub.s32 q13, q2, q0
+ vsub.s32 q1, q11, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d16, q3, #14 ; >> 14
+ vqrshrn.s32 d17, q12, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d18, q13, #14 ; >> 14
+ vqrshrn.s32 d19, q1, #14 ; >> 14
+
+ ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ ; step1[2] * cospi_8_64
+ vmull.s16 q0, d20, d31
+ vmull.s16 q1, d21, d31
+
+ ; step1[2] * cospi_24_64
+ vmull.s16 q12, d20, d30
+ vmull.s16 q13, d21, d30
+
+ ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64
+ vmlal.s16 q0, d28, d30
+ vmlal.s16 q1, d29, d30
+
+ ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64
+ vmlsl.s16 q12, d28, d31
+ vmlsl.s16 q13, d29, d31
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d22, q0, #14 ; >> 14
+ vqrshrn.s32 d23, q1, #14 ; >> 14
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d20, q12, #14 ; >> 14
+ vqrshrn.s32 d21, q13, #14 ; >> 14
+
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5];
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5];
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7];
+ vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7];
+
+ ; generate cospi_16_64 = 11585
+ mov r3, #0x2d00
+ add r3, #0x41
+
+ ; stage 5
+ vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3];
+ vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2];
+ vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2];
+ vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3];
+
+ vdup.16 d16, r3; ; duplicate cospi_16_64
+
+ ; step2[5] * cospi_16_64
+ vmull.s16 q11, d26, d16
+ vmull.s16 q12, d27, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+ vsub.s32 q6, q9, q11
+ vsub.s32 q13, q10, q12
+
+ ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+ vadd.s32 q9, q9, q11
+ vadd.s32 q10, q10, q12
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d10, q6, #14 ; >> 14
+ vqrshrn.s32 d11, q13, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q10, #14 ; >> 14
+
+ ; stage 6
+ vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7];
+
+ ; store the data
+ vst1.64 {d16}, [r1], r2
+ vst1.64 {d17}, [r1], r2
+ vst1.64 {d18}, [r1], r2
+ vst1.64 {d19}, [r1], r2
+ vst1.64 {d20}, [r1], r2
+ vst1.64 {d21}, [r1], r2
+ vst1.64 {d22}, [r1], r2
+ vst1.64 {d23}, [r1], r2
+ vst1.64 {d24}, [r1], r2
+ vst1.64 {d25}, [r1], r2
+ vst1.64 {d26}, [r1], r2
+ vst1.64 {d27}, [r1], r2
+ vst1.64 {d28}, [r1], r2
+ vst1.64 {d29}, [r1], r2
+ vst1.64 {d30}, [r1], r2
+ vst1.64 {d31}, [r1], r2
+
+ bx lr
+ ENDP ; |aom_idct16x16_256_add_neon_pass1|
+
+;void aom_idct16x16_256_add_neon_pass2(int16_t *src,
+; int16_t *output,
+; int16_t *pass1Output,
+; int16_t skip_adding,
+; uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t *src
+; r1 int16_t *output,
+; r2 int16_t *pass1Output,
+; r3 int16_t skip_adding,
+; r4 uint8_t *dest,
+; r5 int dest_stride)
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|aom_idct16x16_256_add_neon_pass2| PROC
+ push {r3-r9}
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q0,q1}, [r0]!
+ vmov.s16 q15, q0;
+
+ ; generate cospi_30_64 = 1606
+ mov r3, #0x0600
+ add r3, #0x46
+
+ ; generate cospi_2_64 = 16305
+ mov r12, #0x3f00
+ add r12, #0xb1
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 d12, r3 ; duplicate cospi_30_64
+ vdup.16 d13, r12 ; duplicate cospi_2_64
+
+ ; preloading to avoid stall
+ ; generate cospi_14_64 = 12665
+ mov r3, #0x3100
+ add r3, #0x79
+
+ ; generate cospi_18_64 = 10394
+ mov r12, #0x2800
+ add r12, #0x9a
+
+ ; step1[8] * cospi_30_64
+ vmull.s16 q2, d16, d12
+ vmull.s16 q3, d17, d12
+
+ ; step1[8] * cospi_2_64
+ vmull.s16 q1, d16, d13
+ vmull.s16 q4, d17, d13
+
+ ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
+ vmlsl.s16 q2, d30, d13
+ vmlsl.s16 q3, d31, d13
+
+ ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
+ vmlal.s16 q1, d30, d12
+ vmlal.s16 q4, d31, d12
+
+ vdup.16 d30, r3 ; duplicate cospi_14_64
+ vdup.16 d31, r12 ; duplicate cospi_18_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d0, q2, #14 ; >> 14
+ vqrshrn.s32 d1, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d14, q1, #14 ; >> 14
+ vqrshrn.s32 d15, q4, #14 ; >> 14
+
+ ; preloading to avoid stall
+ ; generate cospi_22_64 = 7723
+ mov r3, #0x1e00
+ add r3, #0x2b
+
+ ; generate cospi_10_64 = 14449
+ mov r12, #0x3800
+ add r12, #0x71
+
+ ; step1[9] * cospi_14_64
+ vmull.s16 q2, d24, d30
+ vmull.s16 q3, d25, d30
+
+ ; step1[9] * cospi_18_64
+ vmull.s16 q4, d24, d31
+ vmull.s16 q5, d25, d31
+
+ ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
+ vmlsl.s16 q2, d22, d31
+ vmlsl.s16 q3, d23, d31
+
+ ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
+ vmlal.s16 q4, d22, d30
+ vmlal.s16 q5, d23, d30
+
+ vdup.16 d30, r3 ; duplicate cospi_22_64
+ vdup.16 d31, r12 ; duplicate cospi_10_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d2, q2, #14 ; >> 14
+ vqrshrn.s32 d3, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q4, #14 ; >> 14
+ vqrshrn.s32 d13, q5, #14 ; >> 14
+
+ ; step1[10] * cospi_22_64
+ vmull.s16 q11, d20, d30
+ vmull.s16 q12, d21, d30
+
+ ; step1[10] * cospi_10_64
+ vmull.s16 q4, d20, d31
+ vmull.s16 q5, d21, d31
+
+ ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
+ vmlsl.s16 q11, d26, d31
+ vmlsl.s16 q12, d27, d31
+
+ ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
+ vmlal.s16 q4, d26, d30
+ vmlal.s16 q5, d27, d30
+
+ ; preloading to avoid stall
+ ; generate cospi_6_64 = 15679
+ mov r3, #0x3d00
+ add r3, #0x3f
+
+ ; generate cospi_26_64 = 4756
+ mov r12, #0x1200
+ add r12, #0x94
+
+ vdup.16 d30, r3 ; duplicate cospi_6_64
+ vdup.16 d31, r12 ; duplicate cospi_26_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q11, #14 ; >> 14
+ vqrshrn.s32 d5, q12, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d11, q5, #14 ; >> 14
+ vqrshrn.s32 d10, q4, #14 ; >> 14
+
+ ; step1[11] * cospi_6_64
+ vmull.s16 q10, d28, d30
+ vmull.s16 q11, d29, d30
+
+ ; step1[11] * cospi_26_64
+ vmull.s16 q12, d28, d31
+ vmull.s16 q13, d29, d31
+
+ ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
+ vmlsl.s16 q10, d18, d31
+ vmlsl.s16 q11, d19, d31
+
+ ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
+ vmlal.s16 q12, d18, d30
+ vmlal.s16 q13, d19, d30
+
+ vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9]
+ vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9]
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d6, q10, #14 ; >> 14
+ vqrshrn.s32 d7, q11, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d8, q12, #14 ; >> 14
+ vqrshrn.s32 d9, q13, #14 ; >> 14
+
+ ; stage 3
+ vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11]
+ vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11]
+ vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13]
+ vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13]
+ vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+step2[15]
+ vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15]
+
+ ; stage 4
+ ; generate cospi_24_64 = 6270
+ mov r3, #0x1800
+ add r3, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r12, #0x3b00
+ add r12, #0x21
+
+ ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+ vdup.16 d30, r12 ; duplicate cospi_8_64
+ vdup.16 d31, r3 ; duplicate cospi_24_64
+
+ ; step1[9] * cospi_24_64
+ vmull.s16 q2, d18, d31
+ vmull.s16 q3, d19, d31
+
+ ; step1[14] * cospi_24_64
+ vmull.s16 q4, d28, d31
+ vmull.s16 q5, d29, d31
+
+ ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+ vmlal.s16 q2, d28, d30
+ vmlal.s16 q3, d29, d30
+
+ ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+ vmlsl.s16 q4, d18, d30
+ vmlsl.s16 q5, d19, d30
+
+ rsb r12, #0
+ vdup.16 d30, r12 ; duplicate -cospi_8_64
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q2, #14 ; >> 14
+ vqrshrn.s32 d13, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d2, q4, #14 ; >> 14
+ vqrshrn.s32 d3, q5, #14 ; >> 14
+
+ vmov.s16 q3, q11
+ vmov.s16 q4, q12
+
+ ; - step1[13] * cospi_8_64
+ vmull.s16 q11, d26, d30
+ vmull.s16 q12, d27, d30
+
+ ; -step1[10] * cospi_8_64
+ vmull.s16 q8, d20, d30
+ vmull.s16 q9, d21, d30
+
+ ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+ vmlsl.s16 q11, d20, d31
+ vmlsl.s16 q12, d21, d31
+
+ ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+ vmlal.s16 q8, d26, d31
+ vmlal.s16 q9, d27, d31
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q11, #14 ; >> 14
+ vqrshrn.s32 d5, q12, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q8, #14 ; >> 14
+ vqrshrn.s32 d11, q9, #14 ; >> 14
+
+ ; stage 5
+ vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
+ vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10];
+ vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10];
+ vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11];
+ vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15];
+ vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14];
+ vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14];
+ vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
+
+ ; stage 6.
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ vdup.16 d14, r12 ; duplicate cospi_16_64
+
+ ; step1[13] * cospi_16_64
+ vmull.s16 q3, d26, d14
+ vmull.s16 q4, d27, d14
+
+ ; step1[10] * cospi_16_64
+ vmull.s16 q0, d20, d14
+ vmull.s16 q1, d21, d14
+
+ ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+ vsub.s32 q5, q3, q0
+ vsub.s32 q6, q4, q1
+
+ ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+ vadd.s32 q10, q3, q0
+ vadd.s32 q4, q4, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q5, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q10, #14 ; >> 14
+ vqrshrn.s32 d11, q4, #14 ; >> 14
+
+ ; step1[11] * cospi_16_64
+ vmull.s16 q0, d22, d14
+ vmull.s16 q1, d23, d14
+
+ ; step1[12] * cospi_16_64
+ vmull.s16 q13, d24, d14
+ vmull.s16 q6, d25, d14
+
+ ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+ vsub.s32 q10, q13, q0
+ vsub.s32 q4, q6, q1
+
+ ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+ vadd.s32 q13, q13, q0
+ vadd.s32 q6, q6, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d6, q10, #14 ; >> 14
+ vqrshrn.s32 d7, q4, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d8, q13, #14 ; >> 14
+ vqrshrn.s32 d9, q6, #14 ; >> 14
+
+ mov r4, #16 ; pass1Output stride
+ ldr r3, [sp] ; load skip_adding
+ cmp r3, #0 ; check if need adding dest data
+ beq skip_adding_dest
+
+ ldr r7, [sp, #28] ; dest used to save element 0-7
+ mov r9, r7 ; save dest pointer for later use
+ ldr r8, [sp, #32] ; load dest_stride
+
+ ; stage 7
+ ; load the data in pass1
+ vld1.s16 {q0}, [r2], r4 ; load data step2[0]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[1]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[2]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[3]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
+ vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
+ vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
+ vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
+ vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
+ vld1.s16 {q0}, [r2], r4 ; load data step2[4]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[5]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[6]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[7]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
+ vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
+ vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
+ vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
+ vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
+
+ ; store the data output 8,9,10,11,12,13,14,15
+ vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q8 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q9, q9, #6
+ vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q9 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vrshr.s16 q2, q2, #6
+ vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q2 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q3, q3, #6
+ vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q3 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vrshr.s16 q4, q4, #6
+ vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q4 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q5, q5, #6
+ vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q5 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vrshr.s16 q14, q14, #6
+ vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q14 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q15, q15, #6
+ vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q15 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ b end_idct16x16_pass2
+
+skip_adding_dest
+ ; stage 7
+ ; load the data in pass1
+ mov r5, #24
+ mov r3, #8
+
+ vld1.s16 {q0}, [r2], r4 ; load data step2[0]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[1]
+ vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
+ vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[2]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[3]
+ vst1.64 {d24}, [r1], r3 ; store output[0]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[1]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
+ vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
+ vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
+ vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
+ vst1.64 {d24}, [r1], r3 ; store output[2]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[3]
+ vst1.64 {d27}, [r1], r5
+ vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
+ vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
+ vld1.s16 {q0}, [r2], r4 ; load data step2[4]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[5]
+ vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
+ vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[6]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[7]
+ vst1.64 {d24}, [r1], r3 ; store output[4]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[5]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
+ vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
+ vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
+ vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
+ vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
+ vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
+ vst1.64 {d24}, [r1], r3 ; store output[6]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[7]
+ vst1.64 {d27}, [r1], r5
+
+ ; store the data output 8,9,10,11,12,13,14,15
+ vst1.64 {d16}, [r1], r3
+ vst1.64 {d17}, [r1], r5
+ vst1.64 {d18}, [r1], r3
+ vst1.64 {d19}, [r1], r5
+ vst1.64 {d4}, [r1], r3
+ vst1.64 {d5}, [r1], r5
+ vst1.64 {d6}, [r1], r3
+ vst1.64 {d7}, [r1], r5
+ vst1.64 {d8}, [r1], r3
+ vst1.64 {d9}, [r1], r5
+ vst1.64 {d10}, [r1], r3
+ vst1.64 {d11}, [r1], r5
+ vst1.64 {d28}, [r1], r3
+ vst1.64 {d29}, [r1], r5
+ vst1.64 {d30}, [r1], r3
+ vst1.64 {d31}, [r1], r5
+end_idct16x16_pass2
+ pop {r3-r9}
+ bx lr
+ ENDP ; |aom_idct16x16_256_add_neon_pass2|
+
+;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input,
+; int16_t *output, int output_stride)
+;
+; r0 int16_t input
+; r1 int16_t *output
+; r2 int output_stride)
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|aom_idct16x16_10_add_neon_pass1| PROC
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q1,q2}, [r0]!
+ vmov.s16 q15, q1
+
+ ; generate cospi_28_64*2 = 6392
+ mov r3, #0x1800
+ add r3, #0xf8
+
+ ; generate cospi_4_64*2 = 32138
+ mov r12, #0x7d00
+ add r12, #0x8a
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 q0, r3 ; duplicate cospi_28_64*2
+ vdup.16 q1, r12 ; duplicate cospi_4_64*2
+
+ ; The following instructions use vqrdmulh to do the
+ ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply,
+ ; double, and return the high 16 bits, effectively giving >> 15. Doubling
+ ; the constant will change this to >> 14.
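+ ; For example, with step2[4] = 100 and cospi_28_64 = 3196,
+ ; dct_const_round_shift gives (100 * 3196 + (1 << 13)) >> 14 = 20, and
+ ; vqrdmulh with the doubled constant 6392 likewise gives
+ ; (2 * 100 * 6392 + (1 << 15)) >> 16 = 20.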
+ ; dct_const_round_shift(step2[4] * cospi_28_64);
+ vqrdmulh.s16 q4, q9, q0
+
+ ; preloading to avoid stall
+ ; generate cospi_16_64*2 = 23170
+ mov r3, #0x5a00
+ add r3, #0x82
+
+ ; dct_const_round_shift(step2[4] * cospi_4_64);
+ vqrdmulh.s16 q7, q9, q1
+
+ ; stage 4
+ vdup.16 q1, r3 ; cospi_16_64*2
+
+ ; generate cospi_16_64 = 11585
+ mov r3, #0x2d00
+ add r3, #0x41
+
+ vdup.16 d4, r3; ; duplicate cospi_16_64
+
+ ; dct_const_round_shift(step1[0] * cospi_16_64)
+ vqrdmulh.s16 q8, q8, q1
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d14, d4
+ vmull.s16 q10, d15, d4
+
+ ; step2[5] * cospi_16_64
+ vmull.s16 q12, d9, d4
+ vmull.s16 q11, d8, d4
+
+ ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+ vsub.s32 q15, q10, q12
+ vsub.s32 q6, q9, q11
+
+ ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+ vadd.s32 q9, q9, q11
+ vadd.s32 q10, q10, q12
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d11, q15, #14 ; >> 14
+ vqrshrn.s32 d10, q6, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q10, #14 ; >> 14
+
+ ; stage 6
+ vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7];
+ vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5];
+ vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4];
+ vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6];
+ vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4];
+ vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5];
+ vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6];
+ vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7];
+
+ ; store the data
+ vst1.64 {d4}, [r1], r2
+ vst1.64 {d5}, [r1], r2
+ vst1.64 {d18}, [r1], r2
+ vst1.64 {d19}, [r1], r2
+ vst1.64 {d20}, [r1], r2
+ vst1.64 {d21}, [r1], r2
+ vst1.64 {d22}, [r1], r2
+ vst1.64 {d23}, [r1], r2
+ vst1.64 {d24}, [r1], r2
+ vst1.64 {d25}, [r1], r2
+ vst1.64 {d26}, [r1], r2
+ vst1.64 {d27}, [r1], r2
+ vst1.64 {d28}, [r1], r2
+ vst1.64 {d29}, [r1], r2
+ vst1.64 {d30}, [r1], r2
+ vst1.64 {d31}, [r1], r2
+
+ bx lr
+ ENDP ; |aom_idct16x16_10_add_neon_pass1|
+
+;void aom_idct16x16_10_add_neon_pass2(int16_t *src,
+; int16_t *output,
+; int16_t *pass1Output,
+; int16_t skip_adding,
+; uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t *src
+; r1 int16_t *output,
+; r2 int16_t *pass1Output,
+; r3 int16_t skip_adding,
+; r4 uint8_t *dest,
+; r5 int dest_stride)
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|aom_idct16x16_10_add_neon_pass2| PROC
+ push {r3-r9}
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q0,q1}, [r0]!
+ vmov.s16 q15, q0;
+
+ ; generate 2*cospi_30_64 = 3212
+ mov r3, #0xc00
+ add r3, #0x8c
+
+ ; generate 2*cospi_2_64 = 32610
+ mov r12, #0x7f00
+ add r12, #0x62
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 q6, r3 ; duplicate 2*cospi_30_64
+
+ ; dct_const_round_shift(step1[8] * cospi_30_64)
+ vqrdmulh.s16 q0, q8, q6
+
+ vdup.16 q6, r12 ; duplicate 2*cospi_2_64
+
+ ; dct_const_round_shift(step1[8] * cospi_2_64)
+ vqrdmulh.s16 q7, q8, q6
+
+ ; preloading to avoid stall
+ ; generate 2*cospi_26_64 = 9512
+ mov r12, #0x2500
+ add r12, #0x28
+ rsb r12, #0
+ vdup.16 q15, r12 ; duplicate -2*cospi_26_64
+
+ ; generate 2*cospi_6_64 = 31358
+ mov r3, #0x7a00
+ add r3, #0x7e
+ vdup.16 q14, r3 ; duplicate 2*cospi_6_64
+
+ ; dct_const_round_shift(- step1[12] * cospi_26_64)
+ vqrdmulh.s16 q3, q9, q15
+
+ ; dct_const_round_shift(step1[12] * cospi_6_64)
+ vqrdmulh.s16 q4, q9, q14
+
+ ; stage 4
+ ; generate cospi_24_64 = 6270
+ mov r3, #0x1800
+ add r3, #0x7e
+ vdup.16 d31, r3 ; duplicate cospi_24_64
+
+ ; generate cospi_8_64 = 15137
+ mov r12, #0x3b00
+ add r12, #0x21
+ vdup.16 d30, r12 ; duplicate cospi_8_64
+
+ ; step1[14] * cospi_24_64
+ vmull.s16 q12, d14, d31
+ vmull.s16 q5, d15, d31
+
+ ; step1[9] * cospi_24_64
+ vmull.s16 q2, d0, d31
+ vmull.s16 q11, d1, d31
+
+ ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+ vmlsl.s16 q12, d0, d30
+ vmlsl.s16 q5, d1, d30
+
+ ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+ vmlal.s16 q2, d14, d30
+ vmlal.s16 q11, d15, d30
+
+ rsb r12, #0
+ vdup.16 d30, r12 ; duplicate -cospi_8_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d2, q12, #14 ; >> 14
+ vqrshrn.s32 d3, q5, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q2, #14 ; >> 14
+ vqrshrn.s32 d13, q11, #14 ; >> 14
+
+ ; - step1[13] * cospi_8_64
+ vmull.s16 q10, d8, d30
+ vmull.s16 q13, d9, d30
+
+ ; -step1[10] * cospi_8_64
+ vmull.s16 q8, d6, d30
+ vmull.s16 q9, d7, d30
+
+ ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+ vmlsl.s16 q10, d6, d31
+ vmlsl.s16 q13, d7, d31
+
+ ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+ vmlal.s16 q8, d8, d31
+ vmlal.s16 q9, d9, d31
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q10, #14 ; >> 14
+ vqrshrn.s32 d5, q13, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q8, #14 ; >> 14
+ vqrshrn.s32 d11, q9, #14 ; >> 14
+
+ ; stage 5
+ vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
+ vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10];
+ vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10];
+ vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11];
+ vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15];
+ vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14];
+ vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14];
+ vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
+
+ ; stage 6.
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ vdup.16 d14, r12 ; duplicate cospi_16_64
+
+ ; step1[13] * cospi_16_64
+ vmull.s16 q3, d26, d14
+ vmull.s16 q4, d27, d14
+
+ ; step1[10] * cospi_16_64
+ vmull.s16 q0, d20, d14
+ vmull.s16 q1, d21, d14
+
+ ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+ vsub.s32 q5, q3, q0
+ vsub.s32 q6, q4, q1
+
+ ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+ vadd.s32 q0, q3, q0
+ vadd.s32 q1, q4, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q5, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q0, #14 ; >> 14
+ vqrshrn.s32 d11, q1, #14 ; >> 14
+
+ ; step1[11] * cospi_16_64
+ vmull.s16 q0, d22, d14
+ vmull.s16 q1, d23, d14
+
+ ; step1[12] * cospi_16_64
+ vmull.s16 q13, d24, d14
+ vmull.s16 q6, d25, d14
+
+ ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+ vsub.s32 q10, q13, q0
+ vsub.s32 q4, q6, q1
+
+ ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+ vadd.s32 q13, q13, q0
+ vadd.s32 q6, q6, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d6, q10, #14 ; >> 14
+ vqrshrn.s32 d7, q4, #14 ; >> 14
+
+ ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
+ vqrshrn.s32 d8, q13, #14 ; >> 14
+ vqrshrn.s32 d9, q6, #14 ; >> 14
+
+ mov r4, #16 ; pass1Output stride
+ ldr r3, [sp] ; load skip_adding
+
+ ; stage 7
+ ; load the data in pass1
+ mov r5, #24
+ mov r3, #8
+
+ vld1.s16 {q0}, [r2], r4 ; load data step2[0]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[1]
+ vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
+ vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[2]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[3]
+ vst1.64 {d24}, [r1], r3 ; store output[0]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[1]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
+ vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
+ vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
+ vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
+ vst1.64 {d24}, [r1], r3 ; store output[2]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[3]
+ vst1.64 {d27}, [r1], r5
+ vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
+ vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
+ vld1.s16 {q0}, [r2], r4 ; load data step2[4]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[5]
+ vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
+ vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[6]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[7]
+ vst1.64 {d24}, [r1], r3 ; store output[4]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[5]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
+ vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
+ vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
+ vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
+ vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
+ vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
+ vst1.64 {d24}, [r1], r3 ; store output[6]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[7]
+ vst1.64 {d27}, [r1], r5
+
+ ; store the data output 8,9,10,11,12,13,14,15
+ vst1.64 {d16}, [r1], r3
+ vst1.64 {d17}, [r1], r5
+ vst1.64 {d18}, [r1], r3
+ vst1.64 {d19}, [r1], r5
+ vst1.64 {d4}, [r1], r3
+ vst1.64 {d5}, [r1], r5
+ vst1.64 {d6}, [r1], r3
+ vst1.64 {d7}, [r1], r5
+ vst1.64 {d8}, [r1], r3
+ vst1.64 {d9}, [r1], r5
+ vst1.64 {d10}, [r1], r3
+ vst1.64 {d11}, [r1], r5
+ vst1.64 {d28}, [r1], r3
+ vst1.64 {d29}, [r1], r5
+ vst1.64 {d30}, [r1], r3
+ vst1.64 {d31}, [r1], r5
+end_idct10_16x16_pass2
+ pop {r3-r9}
+ bx lr
+ ENDP ; |aom_idct16x16_10_add_neon_pass2|
+ END
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 000000000..b4cb7a0cd
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
+ int output_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+ int16x8x2_t q0x2s16;
+
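+ // vld2q_s16 de-interleaves even- and odd-indexed int16 elements; keeping
+ // only val[0] collects the even-numbered columns of each row, mirroring
+ // the vld2.s16 loads in the assembly version.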
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d18s16, d1s16);
+ q6s32 = vmull_s16(d19s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+ q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+ d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+ d3s16 = vdup_n_s16((int16_t)cospi_20_64);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q5s32, 14);
+ d15s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ q2s32 = vmull_s16(d26s16, d2s16);
+ q3s32 = vmull_s16(d27s16, d2s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q15s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+ d10s16 = vqrshrn_n_s32(q2s32, 14);
+ d11s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q15s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ d30s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d30s16);
+ q11s32 = vmull_s16(d17s16, d30s16);
+ q0s32 = vmull_s16(d24s16, d30s16);
+ q1s32 = vmull_s16(d25s16, d30s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_24_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_8_64);
+
+ q3s32 = vaddq_s32(q2s32, q0s32);
+ q12s32 = vaddq_s32(q11s32, q1s32);
+ q13s32 = vsubq_s32(q2s32, q0s32);
+ q1s32 = vsubq_s32(q11s32, q1s32);
+
+ d16s16 = vqrshrn_n_s32(q3s32, 14);
+ d17s16 = vqrshrn_n_s32(q12s32, 14);
+ d18s16 = vqrshrn_n_s32(q13s32, 14);
+ d19s16 = vqrshrn_n_s32(q1s32, 14);
+ q8s16 = vcombine_s16(d16s16, d17s16);
+ q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q0s32 = vmull_s16(d20s16, d31s16);
+ q1s32 = vmull_s16(d21s16, d31s16);
+ q12s32 = vmull_s16(d20s16, d30s16);
+ q13s32 = vmull_s16(d21s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+ q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+ d22s16 = vqrshrn_n_s32(q0s32, 14);
+ d23s16 = vqrshrn_n_s32(q1s32, 14);
+ d20s16 = vqrshrn_n_s32(q12s32, 14);
+ d21s16 = vqrshrn_n_s32(q13s32, 14);
+ q10s16 = vcombine_s16(d20s16, d21s16);
+ q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q15s16 = vaddq_s16(q6s16, q7s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ // stage 5
+ q0s16 = vaddq_s16(q8s16, q11s16);
+ q1s16 = vaddq_s16(q9s16, q10s16);
+ q2s16 = vsubq_s16(q9s16, q10s16);
+ q3s16 = vsubq_s16(q8s16, q11s16);
+
+ d16s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q11s32 = vmull_s16(d26s16, d16s16);
+ q12s32 = vmull_s16(d27s16, d16s16);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q13s32 = vsubq_s32(q10s32, q12s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d11s16 = vqrshrn_n_s32(q13s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q8s16 = vaddq_s16(q0s16, q15s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d16u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride) {
+ uint8_t *d;
+ uint8x8_t d12u8, d13u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64;
+ int64x1_t d12s64, d13s64;
+ uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+ uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d12s16 = vdup_n_s16((int16_t)cospi_30_64);
+ d13s16 = vdup_n_s16((int16_t)cospi_2_64);
+
+ q2s32 = vmull_s16(d16s16, d12s16);
+ q3s32 = vmull_s16(d17s16, d12s16);
+ q1s32 = vmull_s16(d16s16, d13s16);
+ q4s32 = vmull_s16(d17s16, d13s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+ q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+ q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+ d0s16 = vqrshrn_n_s32(q2s32, 14);
+ d1s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q1s32, 14);
+ d15s16 = vqrshrn_n_s32(q4s32, 14);
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_14_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_18_64);
+
+ q2s32 = vmull_s16(d24s16, d30s16);
+ q3s32 = vmull_s16(d25s16, d30s16);
+ q4s32 = vmull_s16(d24s16, d31s16);
+ q5s32 = vmull_s16(d25s16, d31s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q2s32, 14);
+ d3s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q4s32, 14);
+ d13s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_22_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_10_64);
+
+ q11s32 = vmull_s16(d20s16, d30s16);
+ q12s32 = vmull_s16(d21s16, d30s16);
+ q4s32 = vmull_s16(d20s16, d31s16);
+ q5s32 = vmull_s16(d21s16, d31s16);
+
+ q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d11s16 = vqrshrn_n_s32(q5s32, 14);
+ d10s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_6_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_26_64);
+
+ q10s32 = vmull_s16(d28s16, d30s16);
+ q11s32 = vmull_s16(d29s16, d30s16);
+ q12s32 = vmull_s16(d28s16, d31s16);
+ q13s32 = vmull_s16(d29s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+ q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+ q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+ q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q11s32, 14);
+ d8s16 = vqrshrn_n_s32(q12s32, 14);
+ d9s16 = vqrshrn_n_s32(q13s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 3
+ q9s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q10s16 = vsubq_s16(q3s16, q2s16);
+ q11s16 = vaddq_s16(q2s16, q3s16);
+ q12s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q6s16, q7s16);
+
+ // stage 4
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_8_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_24_64);
+
+ q2s32 = vmull_s16(d18s16, d31s16);
+ q3s32 = vmull_s16(d19s16, d31s16);
+ q4s32 = vmull_s16(d28s16, d31s16);
+ q5s32 = vmull_s16(d29s16, d31s16);
+
+ q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+ q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q3s32, 14);
+ d2s16 = vqrshrn_n_s32(q4s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q3s16 = q11s16;
+ q4s16 = q12s16;
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q11s32 = vmull_s16(d26s16, d30s16);
+ q12s32 = vmull_s16(d27s16, d30s16);
+ q8s32 = vmull_s16(d20s16, d30s16);
+ q9s32 = vmull_s16(d21s16, d30s16);
+
+ q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q10s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q10s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
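+  // When skip_adding is non-zero, the pass-1 (even) results in pass1Output
+  // are combined with this pass's (odd) results, rounded with
+  // ROUND_POWER_OF_TWO(x, 6) (vrshrq_n_s16), added to the destination
+  // pixels and clipped to [0, 255] (vqmovun_s16). Otherwise the combined
+  // results are stored to out, which has a row stride of 16 int16_t; each
+  // row is written as two 4-lane halves, hence the alternating
+  // out += 4 / out += 12 pointer bumps.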
+ if (skip_adding != 0) {
+ d = dest;
+ // load the data in pass1
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+ q13u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ // store the data out 8,9,10,11,12,13,14,15
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q2s16 = vrshrq_n_s16(q2s16, 6);
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q3s16 = vrshrq_n_s16(q3s16, 6);
+ q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+ q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q14s16 = vrshrq_n_s16(q14s16, 6);
+ q14u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ q15s16 = vrshrq_n_s16(q15s16, 6);
+ q15u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ } else { // skip_adding_dest
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+ }
+ return;
+}
+
+void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
+ int output_stride) {
+ int16x4_t d4s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // stage 3
+ q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2));
+ q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2));
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ // stage 4
+ q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2));
+ d4s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+ q9s32 = vmull_s16(d14s16, d4s16);
+ q10s32 = vmull_s16(d15s16, d4s16);
+ q12s32 = vmull_s16(d9s16, d4s16);
+ q11s32 = vmull_s16(d8s16, d4s16);
+
+ q15s32 = vsubq_s32(q10s32, q12s32);
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d11s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q2s16 = vaddq_s16(q8s16, q7s16);
+ q9s16 = vaddq_s16(q8s16, q6s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q8s16, q4s16);
+ q12s16 = vsubq_s16(q8s16, q4s16);
+ q13s16 = vsubq_s16(q8s16, q5s16);
+ q14s16 = vsubq_s16(q8s16, q6s16);
+ q15s16 = vsubq_s16(q8s16, q7s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d4u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+ uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+ (void)skip_adding;
+ (void)dest;
+ (void)dest_stride;
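+  // This variant never adds into the destination: skip_adding, dest and
+  // dest_stride are unused, and the combined results are always written to
+  // out using the same 4/12 interleaved store pattern as the
+  // 256-coefficient pass2 above.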
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // stage 3
+ q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2));
+ q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+ q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2));
+ q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+ q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+ q14s16 = vdupq_n_s16((int16_t)(cospi_6_64 * 2));
+ q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+ q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+ // stage 4
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d6s16 = vget_low_s16(q3s16);
+ d7s16 = vget_high_s16(q3s16);
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_8_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_24_64);
+
+ q12s32 = vmull_s16(d14s16, d31s16);
+ q5s32 = vmull_s16(d15s16, d31s16);
+ q2s32 = vmull_s16(d0s16, d31s16);
+ q11s32 = vmull_s16(d1s16, d31s16);
+
+ q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+ q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q12s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q11s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q10s32 = vmull_s16(d8s16, d30s16);
+ q13s32 = vmull_s16(d9s16, d30s16);
+ q8s32 = vmull_s16(d6s16, d30s16);
+ q9s32 = vmull_s16(d7s16, d30s16);
+
+ q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q10s32, 14);
+ d5s16 = vqrshrn_n_s32(q13s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16((int16_t)cospi_16_64);
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q0s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q0s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
+ d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
+ d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
+ d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
+ d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+ d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ vst1_u64((uint64_t *)out, d16u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d4u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d6u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d7u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d8u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d9u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d10u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d11u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_neon.c
new file mode 100644
index 000000000..db0d4905b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct16x16_neon.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+
+void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
+ int output_stride);
+void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride);
+void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
+ int output_stride);
+void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
+ int16_t *pass1Output, int16_t skip_adding,
+ uint8_t *dest, int dest_stride);
+
+#if HAVE_NEON_ASM
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void aom_push_neon(int64_t *store);
+extern void aom_pop_neon(int64_t *store);
+#endif // HAVE_NEON_ASM
+
+void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+#if HAVE_NEON_ASM
+ int64_t store_reg[8];
+#endif
+ int16_t pass1_output[16 * 16] = { 0 };
+ int16_t row_idct_output[16 * 16] = { 0 };
+
+#if HAVE_NEON_ASM
+ // save d8-d15 register values.
+ aom_push_neon(store_reg);
+#endif
+
+ /* Parallel idct on the upper 8 rows */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+ // stage 6 result in pass1_output.
+ aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7
+ // which will be saved into row_idct_output.
+ aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+ dest, dest_stride);
+
+ /* Parallel idct on the lower 8 rows */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+ // stage 6 result in pass1_output.
+ aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7
+ // which will be saved into row_idct_output.
+ aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
+ pass1_output, 0, dest, dest_stride);
+
+ /* Parallel idct on the left 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+ // stage 6 result in pass1_output.
+ aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+ pass1_output, 1, dest, dest_stride);
+
+ /* Parallel idct on the right 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+ // stage 6 result in pass1_output.
+ aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+ row_idct_output + 8, pass1_output, 1,
+ dest + 8, dest_stride);
+
+#if HAVE_NEON_ASM
+ // restore d8-d15 register values.
+ aom_pop_neon(store_reg);
+#endif
+
+ return;
+}
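+
+// Illustrative sketch only: this is not the library's reference
+// implementation, the names idct16x16_row_col_sketch and idct16_1d are
+// hypothetical, and the helper is not used by the code in this file. It
+// spells out, in plain scalar C, the row/column data flow described by the
+// comments in aom_idct16x16_256_add_neon(): transform the 16 rows into an
+// intermediate buffer, then transform the 16 columns of that buffer, round
+// by 6 bits and add the result into dest. The NEON code above splits each
+// 1-D transform into an even-element pass (pass1) and an odd-element pass
+// (pass2) and folds the add-to-dest into the column pass2, but the overall
+// structure is the same.
+static void idct16x16_row_col_sketch(const int16_t *input, uint8_t *dest,
+                                     int dest_stride,
+                                     void (*idct16_1d)(const int16_t *in,
+                                                       int16_t *out)) {
+  int16_t intermediate[16 * 16];
+  int16_t col_in[16], col_out[16];
+  int r, c;
+  // Pass over rows: 1-D transform of each of the 16 input rows.
+  for (r = 0; r < 16; ++r) idct16_1d(input + r * 16, intermediate + r * 16);
+  // Pass over columns: 1-D transform of each column, then round by 6 bits
+  // ((x + 32) >> 6, i.e. ROUND_POWER_OF_TWO(x, 6)), add to dest and clip
+  // to [0, 255].
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r) col_in[r] = intermediate[r * 16 + c];
+    idct16_1d(col_in, col_out);
+    for (r = 0; r < 16; ++r) {
+      const int v = dest[r * dest_stride + c] + ((col_out[r] + 32) >> 6);
+      dest[r * dest_stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+  }
+}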
+
+void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+#if HAVE_NEON_ASM
+ int64_t store_reg[8];
+#endif
+ int16_t pass1_output[16 * 16] = { 0 };
+ int16_t row_idct_output[16 * 16] = { 0 };
+
+#if HAVE_NEON_ASM
+ // save d8-d15 register values.
+ aom_push_neon(store_reg);
+#endif
+
+ /* Parallel idct on the upper 8 rows */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+ // stage 6 result in pass1_output.
+ aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7
+ // which will be saved into row_idct_output.
+ aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+ dest, dest_stride);
+
+ /* Skip the parallel idct on the lower 8 rows as they are all 0s */
+
+ /* Parallel idct on the left 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+ // stage 6 result in pass1_output.
+ aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+ pass1_output, 1, dest, dest_stride);
+
+ /* Parallel idct on the right 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+ // stage 6 result in pass1_output.
+ aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+ row_idct_output + 8, pass1_output, 1,
+ dest + 8, dest_stride);
+
+#if HAVE_NEON_ASM
+ // restore d8-d15 register values.
+ aom_pop_neon(store_reg);
+#endif
+
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm
new file mode 100644
index 000000000..b04df2d0b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm
@@ -0,0 +1,147 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+ EXPORT |aom_idct32x32_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ;TODO(hkuang): put the following macros in a separate
+ ;file so other idct functions could also use them.
+ MACRO
+ LD_16x8 $src, $stride
+ vld1.8 {q8}, [$src], $stride
+ vld1.8 {q9}, [$src], $stride
+ vld1.8 {q10}, [$src], $stride
+ vld1.8 {q11}, [$src], $stride
+ vld1.8 {q12}, [$src], $stride
+ vld1.8 {q13}, [$src], $stride
+ vld1.8 {q14}, [$src], $stride
+ vld1.8 {q15}, [$src], $stride
+ MEND
+
+ MACRO
+ ADD_DIFF_16x8 $diff
+ vqadd.u8 q8, q8, $diff
+ vqadd.u8 q9, q9, $diff
+ vqadd.u8 q10, q10, $diff
+ vqadd.u8 q11, q11, $diff
+ vqadd.u8 q12, q12, $diff
+ vqadd.u8 q13, q13, $diff
+ vqadd.u8 q14, q14, $diff
+ vqadd.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ SUB_DIFF_16x8 $diff
+ vqsub.u8 q8, q8, $diff
+ vqsub.u8 q9, q9, $diff
+ vqsub.u8 q10, q10, $diff
+ vqsub.u8 q11, q11, $diff
+ vqsub.u8 q12, q12, $diff
+ vqsub.u8 q13, q13, $diff
+ vqsub.u8 q14, q14, $diff
+ vqsub.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ ST_16x8 $dst, $stride
+ vst1.8 {q8}, [$dst], $stride
+ vst1.8 {q9}, [$dst], $stride
+ vst1.8 {q10},[$dst], $stride
+ vst1.8 {q11},[$dst], $stride
+ vst1.8 {q12},[$dst], $stride
+ vst1.8 {q13},[$dst], $stride
+ vst1.8 {q14},[$dst], $stride
+ vst1.8 {q15},[$dst], $stride
+ MEND
+
+;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|aom_idct32x32_1_add_neon| PROC
+ push {lr}
+ pld [r1]
+ add r3, r1, #16 ; r3 dest + 16 for second loop
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 6)
+ add r0, r0, #32 ; + (1 <<((6) - 1))
+ asrs r0, r0, #6 ; >> 6
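+ ; For example, an input[0] of 1000 works through the code above as
+ ;   (1000 * 11585 + 8192) >> 14 = 707
+ ;   ( 707 * 11585 + 8192) >> 14 = 500
+ ;   ( 500 + 32) >> 6            = 8
+ ; so a1 = 8 would be added to (or, when a1 is negative, subtracted from)
+ ; every destination pixel by the loops below.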
+ bge diff_positive_32_32
+
+diff_negative_32_32
+ neg r0, r0
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_negative_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_negative_32_32_loop
+ pop {pc}
+
+diff_positive_32_32
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_positive_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_positive_32_32_loop
+ pop {pc}
+
+ ENDP ; |aom_idct32x32_1_add_neon|
+ END
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c
new file mode 100644
index 000000000..547567c5b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vld1q_u8(d);
+ d += d_stride;
+ *q9u8 = vld1q_u8(d);
+ d += d_stride;
+ *q10u8 = vld1q_u8(d);
+ d += d_stride;
+ *q11u8 = vld1q_u8(d);
+ d += d_stride;
+ *q12u8 = vld1q_u8(d);
+ d += d_stride;
+ *q13u8 = vld1q_u8(d);
+ d += d_stride;
+ *q14u8 = vld1q_u8(d);
+ d += d_stride;
+ *q15u8 = vld1q_u8(d);
+ return;
+}
+
+static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+ uint8x16_t *q9u8, uint8x16_t *q10u8,
+ uint8x16_t *q11u8, uint8x16_t *q12u8,
+ uint8x16_t *q13u8, uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ vst1q_u8(d, *q8u8);
+ d += d_stride;
+ vst1q_u8(d, *q9u8);
+ d += d_stride;
+ vst1q_u8(d, *q10u8);
+ d += d_stride;
+ vst1q_u8(d, *q11u8);
+ d += d_stride;
+ vst1q_u8(d, *q12u8);
+ d += d_stride;
+ vst1q_u8(d, *q13u8);
+ d += d_stride;
+ vst1q_u8(d, *q14u8);
+ d += d_stride;
+ vst1q_u8(d, *q15u8);
+ return;
+}
+
+void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int i, j, dest_stride8;
+ uint8_t *d;
+ int16_t a1;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ dest_stride8 = dest_stride * 8;
+ if (a1 >= 0) { // diff_positive_32_32
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ } else { // diff_negative_32_32
+ a1 = -a1;
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+ &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm
new file mode 100644
index 000000000..e7793fb16
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm
@@ -0,0 +1,1302 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+;TODO(cd): adjust these constants to be able to use vqdmulh for faster
+; dct_const_round_shift(a * b) within butterfly calculations.
+cospi_1_64 EQU 16364
+cospi_2_64 EQU 16305
+cospi_3_64 EQU 16207
+cospi_4_64 EQU 16069
+cospi_5_64 EQU 15893
+cospi_6_64 EQU 15679
+cospi_7_64 EQU 15426
+cospi_8_64 EQU 15137
+cospi_9_64 EQU 14811
+cospi_10_64 EQU 14449
+cospi_11_64 EQU 14053
+cospi_12_64 EQU 13623
+cospi_13_64 EQU 13160
+cospi_14_64 EQU 12665
+cospi_15_64 EQU 12140
+cospi_16_64 EQU 11585
+cospi_17_64 EQU 11003
+cospi_18_64 EQU 10394
+cospi_19_64 EQU 9760
+cospi_20_64 EQU 9102
+cospi_21_64 EQU 8423
+cospi_22_64 EQU 7723
+cospi_23_64 EQU 7005
+cospi_24_64 EQU 6270
+cospi_25_64 EQU 5520
+cospi_26_64 EQU 4756
+cospi_27_64 EQU 3981
+cospi_28_64 EQU 3196
+cospi_29_64 EQU 2404
+cospi_30_64 EQU 1606
+cospi_31_64 EQU 804
+
+
+ EXPORT |aom_idct32x32_1024_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ AREA Block, CODE, READONLY
+
+ ; --------------------------------------------------------------------------
+ ; Load from transposed_buffer
+ ; q14 = transposed_buffer[first_offset]
+ ; q13 = transposed_buffer[second_offset]
+ ; for proper address calculation, the last offset used when manipulating
+ ; transposed_buffer must be passed in. use 0 for first use.
+ MACRO
+ LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
+ ; address calculation with proper stride and loading
+ add r0, #($first_offset - $prev_offset )*8*2
+ vld1.s16 {q14}, [r0]
+ add r0, #($second_offset - $first_offset)*8*2
+ vld1.s16 {q13}, [r0]
+ ; (used) two registers (q14, q13)
+ MEND
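+ ; For example, the hypothetical invocation LOAD_FROM_TRANSPOSED 0, 1, 31
+ ; would advance r0 by (1 - 0)*8*2 = 16 bytes and load q14 with
+ ; transposed_buffer[8..15], then advance by (31 - 1)*8*2 = 480 bytes and
+ ; load q13 with transposed_buffer[248..255]; offsets are in units of one
+ ; 8-element (16-byte) row of the transposed buffer.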
+ ; --------------------------------------------------------------------------
+ ; Load from output (used as temporary storage)
+ ; reg1 = output[first_offset]
+ ; reg2 = output[second_offset]
+ ; for proper address calculation, the last offset used when manipulating
+ ; output (whether reading or storing) must be passed in. use 0 for first
+ ; use.
+ MACRO
+ LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+ ; address calculation with proper stride and loading
+ add r1, #($first_offset - $prev_offset )*32*2
+ vld1.s16 {$reg1}, [r1]
+ add r1, #($second_offset - $first_offset)*32*2
+ vld1.s16 {$reg2}, [r1]
+ ; (used) two registers ($reg1, $reg2)
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Store into output (sometimes as temporary storage)
+ ; output[first_offset] = reg1
+ ; output[second_offset] = reg2
+ ; for proper address calculation, the last offset used when manipulating
+ ; output (whether reading or storing) must be passed in. use 0 for first
+ ; use.
+ MACRO
+ STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+ ; address calculation with proper stride and storing
+ add r1, #($first_offset - $prev_offset )*32*2
+ vst1.16 {$reg1}, [r1]
+ add r1, #($second_offset - $first_offset)*32*2
+ vst1.16 {$reg2}, [r1]
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]
+ vst1.16 {d11}, [r9]
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]!
+ vst1.16 {d11}, [r9]!
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]
+ vst1.16 {d4}, [r7]
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]!
+ vst1.16 {d4}, [r7]!
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Touches q8-q12, q15 (q13-q14 are preserved)
+ ; valid output registers are anything but q8-q11
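+ ; In formula terms, with c1 = $first_constant and c2 = $second_constant,
+ ; this computes the rotation
+ ;   {$reg1, $reg2} = dct_const_round_shift(c1*{$regC, $regD} - c2*{$regA, $regB})
+ ;   {$reg3, $reg4} = dct_const_round_shift(c2*{$regC, $regD} + c1*{$regA, $regB})
+ ; where the final vqrshrn.s32 #14 performs the rounding shift.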
+ MACRO
+ DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ ; TODO(cd): have special case to re-use constants when they are similar for
+ ; consecutive butterflies
+ ; TODO(cd): have special case when both constants are the same, do the
+ ; additions/subtractions before the multiplies.
+ ; generate the constants
+ ; generate scalar constants
+ mov r8, #$first_constant & 0xFF00
+ mov r12, #$second_constant & 0xFF00
+ add r8, #$first_constant & 0x00FF
+ add r12, #$second_constant & 0x00FF
+ ; generate vector constants
+ vdup.16 d30, r8
+ vdup.16 d31, r12
+ ; (used) two for inputs (regA-regD), one for constants (q15)
+ ; do some multiplications (ordered for maximum latency hiding)
+ vmull.s16 q8, $regC, d30
+ vmull.s16 q10, $regA, d31
+ vmull.s16 q9, $regD, d30
+ vmull.s16 q11, $regB, d31
+ vmull.s16 q12, $regC, d31
+ ; (used) five for intermediate (q8-q12), one for constants (q15)
+ ; do some addition/subtractions (to get back two register)
+ vsub.s32 q8, q8, q10
+ vsub.s32 q9, q9, q11
+ ; do more multiplications (ordered for maximum latency hiding)
+ vmull.s16 q10, $regD, d31
+ vmull.s16 q11, $regA, d30
+ vmull.s16 q15, $regB, d30
+ ; (used) six for intermediate (q8-q12, q15)
+ ; do more addition/subtractions
+ vadd.s32 q11, q12, q11
+ vadd.s32 q10, q10, q15
+ ; (used) four for intermediate (q8-q11)
+ ; dct_const_round_shift
+ vqrshrn.s32 $reg1, q8, #14
+ vqrshrn.s32 $reg2, q9, #14
+ vqrshrn.s32 $reg3, q11, #14
+ vqrshrn.s32 $reg4, q10, #14
+ ; (used) two for results, well four d registers
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Touches q8-q12, q15 (q13-q14 are preserved)
+ ; valid output registers are anything but q8-q11
+ MACRO
+ DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ MEND
+ ; --------------------------------------------------------------------------
+
+;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;
+; r0 int16_t *input,
+; r1 uint8_t *dest,
+; r2 int dest_stride)
+; loop counters
+; r4 bands loop counter
+; r5 pass loop counter
+; r8 transpose loop counter
+; combine-add pointers
+; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
+; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
+; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
+; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
+
+|aom_idct32x32_1024_add_neon| PROC
+ ; This function does one pass of idct32x32 transform.
+ ;
+ ; This is done by transposing the input and then doing a 1d transform on
+ ; columns. In the first pass, the transposed columns are the original
+ ; rows. In the second pass, after the transposition, the columns are the
+ ; original columns.
+ ; The 1d transform is done by looping over bands of eight columns (the
+ ; idct32_bands loop). For each band, the transform input transposition
+ ; is done on demand, one band of four 8x8 matrices at a time. The four
+ ; matrices are transposed by pairs (the idct32_transpose_pair loop).
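+ ; In C-like pseudo code the overall control flow is roughly:
+ ;   for (pass = 0; pass < 2; pass++)          (idct32_pass_loop)
+ ;     for (band = 0; band < 4; band++)        (idct32_bands_loop)
+ ;       for (pair = 0; pair < 2; pair++)      (idct32_transpose_pair_loop)
+ ;         transpose two 8x8 blocks into transpose_buffer
+ ;       run the 32-point 1d transform on the eight transposed lines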
+ push {r4-r11}
+ vpush {d8-d15}
+ ; stack operation
+ ; internal buffer into which 8 lines are transposed before being transformed
+ ; int16_t transpose_buffer[32 * 8];
+ ; at sp + [4096, 4607]
+ ; results of the first pass (transpose and transform rows)
+ ; int16_t pass1[32 * 32];
+ ; at sp + [0, 2047]
+ ; results of the second pass (transpose and transform columns)
+ ; int16_t pass2[32 * 32];
+ ; at sp + [2048, 4095]
+ sub sp, sp, #512+2048+2048
+
+ ; r6 = dest + 31 * dest_stride
+ ; r7 = dest + 0 * dest_stride
+ ; r9 = dest + 15 * dest_stride
+ ; r10 = dest + 16 * dest_stride
+ rsb r6, r2, r2, lsl #5
+ rsb r9, r2, r2, lsl #4
+ add r10, r1, r2, lsl #4
+ mov r7, r1
+ add r6, r6, r1
+ add r9, r9, r1
+ ; r11 = -dest_stride
+ neg r11, r2
+ ; r3 = input
+ mov r3, r0
+ ; parameters for first pass
+ ; r0 = transpose_buffer[32 * 8]
+ add r0, sp, #4096
+ ; r1 = pass1[32 * 32]
+ mov r1, sp
+
+ mov r5, #0 ; initialize pass loop counter
+idct32_pass_loop
+ mov r4, #4 ; initialize bands loop counter
+idct32_bands_loop
+ mov r8, #2 ; initialize transpose loop counter
+idct32_transpose_pair_loop
+ ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
+ ; into q8-q15 and the second one into q0-q7. There is a stride of 64,
+ ; adjusted to 32 because of the two post-increments.
+ vld1.s16 {q8}, [r3]!
+ vld1.s16 {q0}, [r3]!
+ add r3, #32
+ vld1.s16 {q9}, [r3]!
+ vld1.s16 {q1}, [r3]!
+ add r3, #32
+ vld1.s16 {q10}, [r3]!
+ vld1.s16 {q2}, [r3]!
+ add r3, #32
+ vld1.s16 {q11}, [r3]!
+ vld1.s16 {q3}, [r3]!
+ add r3, #32
+ vld1.s16 {q12}, [r3]!
+ vld1.s16 {q4}, [r3]!
+ add r3, #32
+ vld1.s16 {q13}, [r3]!
+ vld1.s16 {q5}, [r3]!
+ add r3, #32
+ vld1.s16 {q14}, [r3]!
+ vld1.s16 {q6}, [r3]!
+ add r3, #32
+ vld1.s16 {q15}, [r3]!
+ vld1.s16 {q7}, [r3]!
+
+ ; Transpose the two 8x8 16bit data matrices.
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vswp d1, d8
+ vswp d7, d14
+ vswp d5, d12
+ vswp d3, d10
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; Store both matrices after each other. There is a stride of 32, which
+ ; adjusts to nothing because of the post-increments.
+ vst1.16 {q8}, [r0]!
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+ vst1.16 {q0}, [r0]!
+ vst1.16 {q1}, [r0]!
+ vst1.16 {q2}, [r0]!
+ vst1.16 {q3}, [r0]!
+ vst1.16 {q4}, [r0]!
+ vst1.16 {q5}, [r0]!
+ vst1.16 {q6}, [r0]!
+ vst1.16 {q7}, [r0]!
+
+ ; increment pointers by adjusted stride (not necessary for r0/out)
+ ; go back by 7*32 for the seven lines fully traversed by the loads and adds
+ ; go back by 32 for the eighth line, which was only read
+ ; advance by 16*2 to go to the next pair
+ sub r3, r3, #7*32*2 + 32 - 16*2
+ ; transpose pair loop processing
+ subs r8, r8, #1
+ bne idct32_transpose_pair_loop
+
+ ; restore r0/input to its original value
+ sub r0, r0, #32*8*2
+
+ ; Instead of doing the transforms stage by stage, it is done by loading
+ ; some input values and doing as many stages as possible to minimize the
+ ; storing/loading of intermediate results. To fit within registers, the
+ ; final coefficients are cut into four blocks:
+ ; BLOCK A: 16-19,28-31
+ ; BLOCK B: 20-23,24-27
+ ; BLOCK C: 8-10,11-15
+ ; BLOCK D: 0-3,4-7
+ ; Blocks A and C are straight calculation through the various stages. In
+ ; block B, further calculations are performed using the results from
+ ; block A. In block D, further calculations are performed using the results
+ ; from block C and then the final calculations are done using results from
+ ; block A and B which have been combined at the end of block B.
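+ ; As a concrete example of the data flow: block A consumes the transposed
+ ; input rows 1,31,17,15,9,23,25,7 and produces step values 16-19 and 28-31,
+ ; which block B later reads back via LOAD_FROM_OUTPUT.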
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK A: 16-19,28-31
+ ; --------------------------------------------------------------------------
+ ; generate 16,17,30,31
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
+ ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
+ ;step1b[16][i] = dct_const_round_shift(temp1);
+ ;step1b[31][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 0, 1, 31
+ DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
+ ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
+ ;step1b[17][i] = dct_const_round_shift(temp1);
+ ;step1b[30][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 31, 17, 15
+ DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[16] = step1b[16][i] + step1b[17][i];
+ ;step2[17] = step1b[16][i] - step1b[17][i];
+ ;step2[30] = -step1b[30][i] + step1b[31][i];
+ ;step2[31] = step1b[30][i] + step1b[31][i];
+ vadd.s16 q4, q0, q1
+ vsub.s16 q13, q0, q1
+ vadd.s16 q6, q2, q3
+ vsub.s16 q14, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
+ ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
+ ;step3[17] = dct_const_round_shift(temp1);
+ ;step3[30] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; generate 18,19,28,29
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
+ ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
+ ;step1b[18][i] = dct_const_round_shift(temp1);
+ ;step1b[29][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 15, 9, 23
+ DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
+ ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
+ ;step1b[19][i] = dct_const_round_shift(temp1);
+ ;step1b[28][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 23, 25, 7
+ DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[18] = -step1b[18][i] + step1b[19][i];
+ ;step2[19] = step1b[18][i] + step1b[19][i];
+ ;step2[28] = step1b[28][i] + step1b[29][i];
+ ;step2[29] = step1b[28][i] - step1b[29][i];
+ vsub.s16 q13, q3, q2
+ vadd.s16 q3, q3, q2
+ vsub.s16 q14, q1, q0
+ vadd.s16 q2, q1, q0
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
+ ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
+ ;step3[29] = dct_const_round_shift(temp1);
+ ;step3[18] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
+ ; --------------------------------------------------------------------------
+ ; combine 16-19,28-31
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[16] = step1b[16][i] + step1b[19][i];
+ ;step1[17] = step1b[17][i] + step1b[18][i];
+ ;step1[18] = step1b[17][i] - step1b[18][i];
+ ;step1[29] = step1b[30][i] - step1b[29][i];
+ ;step1[30] = step1b[30][i] + step1b[29][i];
+ ;step1[31] = step1b[31][i] + step1b[28][i];
+ vadd.s16 q8, q4, q2
+ vadd.s16 q9, q5, q0
+ vadd.s16 q10, q7, q1
+ vadd.s16 q15, q6, q3
+ vsub.s16 q13, q5, q0
+ vsub.s16 q14, q7, q1
+ STORE_IN_OUTPUT 0, 16, 31, q8, q15
+ STORE_IN_OUTPUT 31, 17, 30, q9, q10
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
+ ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
+ ;step2[18] = dct_const_round_shift(temp1);
+ ;step2[29] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
+ STORE_IN_OUTPUT 30, 29, 18, q1, q0
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[19] = step1b[16][i] - step1b[19][i];
+ ;step1[28] = step1b[31][i] - step1b[28][i];
+ vsub.s16 q13, q4, q2
+ vsub.s16 q14, q6, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
+ ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
+ ;step2[19] = dct_const_round_shift(temp1);
+ ;step2[28] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
+ STORE_IN_OUTPUT 18, 19, 28, q4, q6
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK B: 20-23,24-27
+ ; --------------------------------------------------------------------------
+ ; generate 20,21,26,27
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
+ ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
+ ;step1b[20][i] = dct_const_round_shift(temp1);
+ ;step1b[27][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 7, 5, 27
+ DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
+ ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
+ ;step1b[21][i] = dct_const_round_shift(temp1);
+ ;step1b[26][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 27, 21, 11
+ DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[20] = step1b[20][i] + step1b[21][i];
+ ;step2[21] = step1b[20][i] - step1b[21][i];
+ ;step2[26] = -step1b[26][i] + step1b[27][i];
+ ;step2[27] = step1b[26][i] + step1b[27][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
+ ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
+ ;step3[21] = dct_const_round_shift(temp1);
+ ;step3[26] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 22,23,24,25
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
+ ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
+ ;step1b[22][i] = dct_const_round_shift(temp1);
+ ;step1b[25][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 11, 13, 19
+ DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
+ ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
+ ;step1b[23][i] = dct_const_round_shift(temp1);
+ ;step1b[24][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 19, 29, 3
+ DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[22] = -step1b[22][i] + step1b[23][i];
+ ;step2[23] = step1b[22][i] + step1b[23][i];
+ ;step2[24] = step1b[24][i] + step1b[25][i];
+ ;step2[25] = step1b[24][i] - step1b[25][i];
+ vsub.s16 q14, q4, q5
+ vadd.s16 q5, q4, q5
+ vsub.s16 q13, q6, q7
+ vadd.s16 q6, q6, q7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
+ ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
+ ;step3[25] = dct_const_round_shift(temp1);
+ ;step3[22] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
+ ; --------------------------------------------------------------------------
+ ; combine 20-23,24-27
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[22] = step1b[22][i] + step1b[21][i];
+ ;step1[23] = step1b[23][i] + step1b[20][i];
+ vadd.s16 q10, q7, q1
+ vadd.s16 q11, q5, q0
+ ;step1[24] = step1b[24][i] + step1b[27][i];
+ ;step1[25] = step1b[25][i] + step1b[26][i];
+ vadd.s16 q12, q6, q2
+ vadd.s16 q15, q4, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[16] = step1b[16][i] + step1b[23][i];
+ ;step3[17] = step1b[17][i] + step1b[22][i];
+ ;step3[22] = step1b[17][i] - step1b[22][i];
+ ;step3[23] = step1b[16][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
+ vadd.s16 q8, q14, q11
+ vadd.s16 q9, q13, q10
+ vsub.s16 q13, q13, q10
+ vsub.s16 q11, q14, q11
+ STORE_IN_OUTPUT 17, 17, 16, q9, q8
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[24] = step1b[31][i] - step1b[24][i];
+ ;step3[25] = step1b[30][i] - step1b[25][i];
+ ;step3[30] = step1b[30][i] + step1b[25][i];
+ ;step3[31] = step1b[31][i] + step1b[24][i];
+ LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
+ vsub.s16 q8, q9, q12
+ vadd.s16 q10, q14, q15
+ vsub.s16 q14, q14, q15
+ vadd.s16 q12, q9, q12
+ STORE_IN_OUTPUT 31, 30, 31, q10, q12
+ ; --------------------------------------------------------------------------
+ ; TODO(cd) do some register allocation change to remove these push/pop
+ vpush {q8} ; [24]
+ vpush {q11} ; [23]
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
+ ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
+ ;step1[22] = dct_const_round_shift(temp1);
+ ;step1[25] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 31, 25, 22, q14, q13
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
+ ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
+ ;step1[23] = dct_const_round_shift(temp1);
+ ;step1[24] = dct_const_round_shift(temp2);
+ ; TODO(cd) do some register allocation change to remove these push/pop
+ vpop {q13} ; [23]
+ vpop {q14} ; [24]
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 22, 24, 23, q14, q13
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[20] = step1b[23][i] - step1b[20][i];
+ ;step1[27] = step1b[24][i] - step1b[27][i];
+ vsub.s16 q14, q5, q0
+ vsub.s16 q13, q6, q2
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
+ ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
+ ;step2[27] = dct_const_round_shift(temp1);
+ ;step2[20] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[21] = step1b[22][i] - step1b[21][i];
+ ;step1[26] = step1b[25][i] - step1b[26][i];
+ vsub.s16 q14, q7, q1
+ vsub.s16 q13, q4, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
+ ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
+ ;step2[26] = dct_const_round_shift(temp1);
+ ;step2[21] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[18] = step1b[18][i] + step1b[21][i];
+ ;step3[19] = step1b[19][i] + step1b[20][i];
+ ;step3[20] = step1b[19][i] - step1b[20][i];
+ ;step3[21] = step1b[18][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
+ vadd.s16 q8, q14, q1
+ vadd.s16 q9, q13, q6
+ vsub.s16 q13, q13, q6
+ vsub.s16 q1, q14, q1
+ STORE_IN_OUTPUT 19, 18, 19, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[27] = step1b[28][i] - step1b[27][i];
+ ;step3[28] = step1b[28][i] + step1b[27][i];
+ ;step3[29] = step1b[29][i] + step1b[26][i];
+ ;step3[26] = step1b[29][i] - step1b[26][i];
+ LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
+ vsub.s16 q14, q8, q5
+ vadd.s16 q10, q8, q5
+ vadd.s16 q11, q9, q0
+ vsub.s16 q0, q9, q0
+ STORE_IN_OUTPUT 29, 28, 29, q10, q11
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
+ ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
+ ;step1[20] = dct_const_round_shift(temp1);
+ ;step1[27] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 29, 20, 27, q13, q14
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
+ ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
+ ;step1[21] = dct_const_round_shift(temp1);
+ ;step1[26] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
+ STORE_IN_OUTPUT 27, 21, 26, q1, q0
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK C: 8-10,11-15
+ ; --------------------------------------------------------------------------
+ ; generate 8,9,14,15
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
+ ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
+ ;step2[8] = dct_const_round_shift(temp1);
+ ;step2[15] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 3, 2, 30
+ DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
+ ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
+ ;step2[9] = dct_const_round_shift(temp1);
+ ;step2[14] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 30, 18, 14
+ DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;step3[8] = step1b[8][i] + step1b[9][i];
+ ;step3[9] = step1b[8][i] - step1b[9][i];
+ ;step3[14] = step1b[15][i] - step1b[14][i];
+ ;step3[15] = step1b[15][i] + step1b[14][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
+ ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
+ ;step1[9] = dct_const_round_shift(temp1);
+ ;step1[14] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 10,11,12,13
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
+ ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
+ ;step2[10] = dct_const_round_shift(temp1);
+ ;step2[13] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 14, 10, 22
+ DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
+ ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
+ ;step2[11] = dct_const_round_shift(temp1);
+ ;step2[12] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 22, 26, 6
+ DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;step3[10] = step1b[11][i] - step1b[10][i];
+ ;step3[11] = step1b[11][i] + step1b[10][i];
+ ;step3[12] = step1b[12][i] + step1b[13][i];
+ ;step3[13] = step1b[12][i] - step1b[13][i];
+ vsub.s16 q14, q4, q5
+ vadd.s16 q5, q4, q5
+ vsub.s16 q13, q6, q7
+ vadd.s16 q6, q6, q7
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
+ ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
+ ;step1[13] = dct_const_round_shift(temp1);
+ ;step1[10] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
+ ; --------------------------------------------------------------------------
+ ; combine 8-10,11-15
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[8] = step1b[8][i] + step1b[11][i];
+ ;step2[9] = step1b[9][i] + step1b[10][i];
+ ;step2[10] = step1b[9][i] - step1b[10][i];
+ vadd.s16 q8, q0, q5
+ vadd.s16 q9, q1, q7
+ vsub.s16 q13, q1, q7
+ ;step2[13] = step1b[14][i] - step1b[13][i];
+ ;step2[14] = step1b[14][i] + step1b[13][i];
+ ;step2[15] = step1b[15][i] + step1b[12][i];
+ vsub.s16 q14, q3, q4
+ vadd.s16 q10, q3, q4
+ vadd.s16 q15, q2, q6
+ STORE_IN_OUTPUT 26, 8, 15, q8, q15
+ STORE_IN_OUTPUT 15, 9, 14, q9, q10
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
+ ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
+ ;step3[10] = dct_const_round_shift(temp1);
+ ;step3[13] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ STORE_IN_OUTPUT 14, 13, 10, q3, q1
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[11] = step1b[8][i] - step1b[11][i];
+ ;step2[12] = step1b[15][i] - step1b[12][i];
+ vsub.s16 q13, q0, q5
+ vsub.s16 q14, q2, q6
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
+ ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
+ ;step3[11] = dct_const_round_shift(temp1);
+ ;step3[12] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ STORE_IN_OUTPUT 10, 11, 12, q1, q3
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK D: 0-3,4-7
+ ; --------------------------------------------------------------------------
+ ; generate 4,5,6,7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
+ ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
+ ;step3[4] = dct_const_round_shift(temp1);
+ ;step3[7] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 6, 4, 28
+ DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
+ ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
+ ;step3[5] = dct_const_round_shift(temp1);
+ ;step3[6] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 28, 20, 12
+ DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[4] = step1b[4][i] + step1b[5][i];
+ ;step1[5] = step1b[4][i] - step1b[5][i];
+ ;step1[6] = step1b[7][i] - step1b[6][i];
+ ;step1[7] = step1b[7][i] + step1b[6][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
+ ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
+ ;step2[5] = dct_const_round_shift(temp1);
+ ;step2[6] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 0,1,2,3
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
+ ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
+ ;step1[1] = dct_const_round_shift(temp1);
+ ;step1[0] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 12, 0, 16
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
+ ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
+ ;step1[2] = dct_const_round_shift(temp1);
+ ;step1[3] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 16, 8, 24
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[0] = step1b[0][i] + step1b[3][i];
+ ;step2[1] = step1b[1][i] + step1b[2][i];
+ ;step2[2] = step1b[1][i] - step1b[2][i];
+ ;step2[3] = step1b[0][i] - step1b[3][i];
+ vadd.s16 q4, q7, q6
+ vsub.s16 q7, q7, q6
+ vsub.s16 q6, q5, q14
+ vadd.s16 q5, q5, q14
+ ; --------------------------------------------------------------------------
+ ; combine 0-3,4-7
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[0] = step1b[0][i] + step1b[7][i];
+ ;step3[1] = step1b[1][i] + step1b[6][i];
+ ;step3[2] = step1b[2][i] + step1b[5][i];
+ ;step3[3] = step1b[3][i] + step1b[4][i];
+ vadd.s16 q8, q4, q2
+ vadd.s16 q9, q5, q3
+ vadd.s16 q10, q6, q1
+ vadd.s16 q11, q7, q0
+ ;step3[4] = step1b[3][i] - step1b[4][i];
+ ;step3[5] = step1b[2][i] - step1b[5][i];
+ ;step3[6] = step1b[1][i] - step1b[6][i];
+ ;step3[7] = step1b[0][i] - step1b[7][i];
+ vsub.s16 q12, q7, q0
+ vsub.s16 q13, q6, q1
+ vsub.s16 q14, q5, q3
+ vsub.s16 q15, q4, q2
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[0] = step1b[0][i] + step1b[15][i];
+ ;step1[1] = step1b[1][i] + step1b[14][i];
+ ;step1[14] = step1b[1][i] - step1b[14][i];
+ ;step1[15] = step1b[0][i] - step1b[15][i];
+ LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
+ vadd.s16 q2, q8, q1
+ vadd.s16 q3, q9, q0
+ vsub.s16 q4, q9, q0
+ vsub.s16 q5, q8, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[14 * 32] = step1b[14][i] + step1b[17][i];
+ ;output[15 * 32] = step1b[15][i] + step1b[16][i];
+ ;output[16 * 32] = step1b[15][i] - step1b[16][i];
+ ;output[17 * 32] = step1b[14][i] - step1b[17][i];
+ LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+
+ cmp r5, #0
+ bgt idct32_bands_end_2nd_pass
+
+idct32_bands_end_1st_pass
+ STORE_IN_OUTPUT 17, 16, 17, q6, q7
+ STORE_IN_OUTPUT 17, 14, 15, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+ ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+ ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+ ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+ LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 31, 30, 31, q6, q7
+ STORE_IN_OUTPUT 31, 0, 1, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[2] = step1b[2][i] + step1b[13][i];
+ ;step1[3] = step1b[3][i] + step1b[12][i];
+ ;step1[12] = step1b[3][i] - step1b[12][i];
+ ;step1[13] = step1b[2][i] - step1b[13][i];
+ LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
+ vadd.s16 q2, q10, q1
+ vadd.s16 q3, q11, q0
+ vsub.s16 q4, q11, q0
+ vsub.s16 q5, q10, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+ ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+ ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+ ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+ LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 19, 18, 19, q6, q7
+ STORE_IN_OUTPUT 19, 12, 13, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+ ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+ ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+ ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+ LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 29, 28, 29, q6, q7
+ STORE_IN_OUTPUT 29, 2, 3, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[4] = step1b[4][i] + step1b[11][i];
+ ;step1[5] = step1b[5][i] + step1b[10][i];
+ ;step1[10] = step1b[5][i] - step1b[10][i];
+ ;step1[11] = step1b[4][i] - step1b[11][i];
+ LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
+ vadd.s16 q2, q12, q1
+ vadd.s16 q3, q13, q0
+ vsub.s16 q4, q13, q0
+ vsub.s16 q5, q12, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+ ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+ ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+ ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 21, 20, 21, q6, q7
+ STORE_IN_OUTPUT 21, 10, 11, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+ ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+ ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+ ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+ LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 27, 26, 27, q6, q7
+ STORE_IN_OUTPUT 27, 4, 5, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[6] = step1b[6][i] + step1b[9][i];
+ ;step1[7] = step1b[7][i] + step1b[8][i];
+ ;step1[8] = step1b[7][i] - step1b[8][i];
+ ;step1[9] = step1b[6][i] - step1b[9][i];
+ LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
+ vadd.s16 q2, q14, q1
+ vadd.s16 q3, q15, q0
+ vsub.s16 q4, q15, q0
+ vsub.s16 q5, q14, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+ ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+ ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+ ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 23, 22, 23, q6, q7
+ STORE_IN_OUTPUT 23, 8, 9, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+ ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+ ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+ ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+ LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 25, 24, 25, q6, q7
+ STORE_IN_OUTPUT 25, 6, 7, q4, q5
+
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #7*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
+
+ ; bands loop processing
+ subs r4, r4, #1
+ bne idct32_bands_loop
+
+ ; parameters for second pass
+ ; the input of pass 2 is the result of pass 1. We have to remove the offset
+ ; of 32 columns introduced by the above idct32_bands_loop
+ sub r3, r1, #32*2
+ ; r1 = pass2[32 * 32]
+ add r1, sp, #2048
+
+ ; pass loop processing
+ add r5, r5, #1
+ b idct32_pass_loop
+
+idct32_bands_end_2nd_pass
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+ ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+ ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+ ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+ LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[2] = step1b[2][i] + step1b[13][i];
+ ;step1[3] = step1b[3][i] + step1b[12][i];
+ ;step1[12] = step1b[3][i] - step1b[12][i];
+ ;step1[13] = step1b[2][i] - step1b[13][i];
+ LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
+ vadd.s16 q2, q10, q1
+ vadd.s16 q3, q11, q0
+ vsub.s16 q4, q11, q0
+ vsub.s16 q5, q10, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+ ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+ ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+ ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+ LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+ ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+ ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+ ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+ LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[4] = step1b[4][i] + step1b[11][i];
+ ;step1[5] = step1b[5][i] + step1b[10][i];
+ ;step1[10] = step1b[5][i] - step1b[10][i];
+ ;step1[11] = step1b[4][i] - step1b[11][i];
+ LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
+ vadd.s16 q2, q12, q1
+ vadd.s16 q3, q13, q0
+ vsub.s16 q4, q13, q0
+ vsub.s16 q5, q12, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+ ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+ ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+ ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+ ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+ ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+ ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+ LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[6] = step1b[6][i] + step1b[9][i];
+ ;step1[7] = step1b[7][i] + step1b[8][i];
+ ;step1[8] = step1b[7][i] - step1b[8][i];
+ ;step1[9] = step1b[6][i] - step1b[9][i];
+ LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
+ vadd.s16 q2, q14, q1
+ vadd.s16 q3, q15, q0
+ vsub.s16 q4, q15, q0
+ vsub.s16 q5, q14, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+ ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+ ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+ ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+ ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+ ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+ ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+ LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; restore pointers to their initial indices for the next band pass by
+ ; removing/adding dest_stride * 8. The actual increment by eight
+ ; is taken care of within the _LAST macros.
+ add r6, r6, r2, lsl #3
+ add r9, r9, r2, lsl #3
+ sub r7, r7, r2, lsl #3
+ sub r10, r10, r2, lsl #3
+
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #25*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
+
+ ; bands loop processing
+ subs r4, r4, #1
+ bne idct32_bands_loop
+
+ ; stack operation
+ add sp, sp, #512+2048+2048
+ vpop {d8-d15}
+ pop {r4-r11}
+ bx lr
+ ENDP ; |aom_idct32x32_1024_add_neon|
+ END
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c
new file mode 100644
index 000000000..a7562c7d5
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
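+// The LOAD_FROM_* and STORE_IN_OUTPUT macros below ignore their first (prev)
+// argument; it is presumably kept so the call sites mirror the assembly
+// macros. They expect trans_buf, out, q13s16 and q14s16 to exist in the
+// calling scope.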
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+ q14s16 = vld1q_s16(trans_buf + first * 8); \
+ q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+ qA = vld1q_s16(out + first * 32); \
+ qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+ vst1q_s16(out + first * 32, qA); \
+ vst1q_s16(out + second * 32, qB);
+
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
+ int stride, int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16) {
+ int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+ d8s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d11s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d9s16 = vld1_s16((int16_t *)p1);
+ d10s16 = vld1_s16((int16_t *)p2);
+
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+
+ q7s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
+ q8s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
+ q9s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
+ q6s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
+
+ d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+ d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+ d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+ vst1_s16((int16_t *)p1, d9s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d10s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p1, d8s16);
+ vst1_s16((int16_t *)p2, d11s16);
+ return;
+}
+
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+ ; \
+ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
+ int stride, int16x8_t q4s16,
+ int16x8_t q5s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16) {
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+ d4s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d7s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d5s16 = vld1_s16((int16_t *)p1);
+ d6s16 = vld1_s16((int16_t *)p2);
+
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+
+ q5s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
+ q6s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
+ q7s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
+ q4s16 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
+
+ d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+ d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+ d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+ vst1_s16((int16_t *)p1, d5s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d6s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p2, d7s16);
+ vst1_s16((int16_t *)p1, d4s16);
+ return;
+}
+
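+// DO_BUTTERFLY widens both inputs to 32 bits, applies the rotation
+//   *qAs16 = dct_const_round_shift(q14s16 * first_const - q13s16 * second_const)
+//   *qBs16 = dct_const_round_shift(q14s16 * second_const + q13s16 * first_const)
+// and narrows back with a rounding shift by 14.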
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
+ int16_t first_const, int16_t second_const,
+ int16x8_t *qAs16, int16x8_t *qBs16) {
+ int16x4_t d30s16, d31s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+ int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+ dCs16 = vget_low_s16(q14s16);
+ dDs16 = vget_high_s16(q14s16);
+ dAs16 = vget_low_s16(q13s16);
+ dBs16 = vget_high_s16(q13s16);
+
+ d30s16 = vdup_n_s16(first_const);
+ d31s16 = vdup_n_s16(second_const);
+
+ q8s32 = vmull_s16(dCs16, d30s16);
+ q10s32 = vmull_s16(dAs16, d31s16);
+ q9s32 = vmull_s16(dDs16, d30s16);
+ q11s32 = vmull_s16(dBs16, d31s16);
+ q12s32 = vmull_s16(dCs16, d31s16);
+
+ q8s32 = vsubq_s32(q8s32, q10s32);
+ q9s32 = vsubq_s32(q9s32, q11s32);
+
+ q10s32 = vmull_s16(dDs16, d31s16);
+ q11s32 = vmull_s16(dAs16, d30s16);
+ q15s32 = vmull_s16(dBs16, d30s16);
+
+ q11s32 = vaddq_s32(q12s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q15s32);
+
+ *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
+ *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
+ return;
+}
+
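+// Transpose one band of the input: four 8x8 blocks covering 8 rows by 32
+// columns (row stride 32) are each transposed and written back to back into
+// t_buf (32 * 8 values).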
+static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
+ int16_t *in;
+ int i;
+ const int stride = 32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ in = input;
+ q8s16 = vld1q_s16(in);
+ in += stride;
+ q9s16 = vld1q_s16(in);
+ in += stride;
+ q10s16 = vld1q_s16(in);
+ in += stride;
+ q11s16 = vld1q_s16(in);
+ in += stride;
+ q12s16 = vld1q_s16(in);
+ in += stride;
+ q13s16 = vld1q_s16(in);
+ in += stride;
+ q14s16 = vld1q_s16(in);
+ in += stride;
+ q15s16 = vld1q_s16(in);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ q12s16 = vcombine_s16(d17s16, d25s16);
+ q13s16 = vcombine_s16(d19s16, d27s16);
+ q14s16 = vcombine_s16(d21s16, d29s16);
+ q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
+ q1x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
+ q2x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16));
+ q3x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ vst1q_s16(t_buf, q0x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q0x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[1]);
+ t_buf += 8;
+ }
+ return;
+}
+
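+// End of one band in the first pass: combine the final-stage values held in
+// registers with the intermediate results already stored in out, and write
+// all 32 output rows of the current eight-column band back to out.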
+static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
+ int16x8_t q3s16, int16x8_t q6s16,
+ int16x8_t q7s16, int16x8_t q8s16,
+ int16x8_t q9s16, int16x8_t q10s16,
+ int16x8_t q11s16, int16x8_t q12s16,
+ int16x8_t q13s16, int16x8_t q14s16,
+ int16x8_t q15s16) {
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+ STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+ STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+ STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+ STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+ STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+ STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+ STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+ STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+ return;
+}
+
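+// End of one band in the second pass: the same recombination as above, but
+// the final values are rounded, added to the destination pixels and stored
+// through the STORE_COMBINE_* helpers instead of being written back to out.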
+static INLINE void idct32_bands_end_2nd_pass(
+ int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
+ int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
+ int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
+ int16x8_t q14s16, int16x8_t q15s16) {
+ uint8_t *r6 = dest + 31 * stride;
+ uint8_t *r7 = dest /* + 0 * stride*/;
+ uint8_t *r9 = dest + 15 * stride;
+ uint8_t *r10 = dest + 16 * stride;
+ int str2 = stride << 1;
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2;
+ r9 -= str2;
+
+ LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2;
+ r6 -= str2;
+
+ LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2;
+ r9 -= str2;
+
+ LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2;
+ r6 -= str2;
+
+ LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2;
+ r9 -= str2;
+
+ LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2;
+ r6 -= str2;
+
+ LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+ LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ return;
+}
+
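+// Pass 1 transforms the rows of input into pass1[]; pass 2 transforms the
+// columns of pass1[], using pass2[] for intermediate values and adding the
+// final result to dest. Each pass works on bands of eight lines at a time.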
+void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
+ int i, idct32_pass_loop;
+ int16_t trans_buf[32 * 8];
+ int16_t pass1[32 * 32];
+ int16_t pass2[32 * 32];
+ int16_t *out;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+ for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+ idct32_pass_loop++,
+ input = pass1, // the input of pass2 is the result of pass1
+ out = pass2) {
+ for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(0, 1, 31)
+ DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(31, 17, 15)
+ DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+ // part of stage 2
+ q4s16 = vaddq_s16(q0s16, q1s16);
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q6s16 = vaddq_s16(q2s16, q3s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+ // generate 18,19,28,29
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(15, 9, 23)
+ DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(23, 25, 7)
+ DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q3s16, q2s16);
+ q3s16 = vaddq_s16(q3s16, q2s16);
+ q14s16 = vsubq_s16(q1s16, q0s16);
+ q2s16 = vaddq_s16(q1s16, q0s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+ // part of stage 4
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q15s16 = vaddq_s16(q6s16, q3s16);
+ q13s16 = vsubq_s16(q5s16, q0s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+ STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+ STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q4s16, q2s16);
+ q14s16 = vsubq_s16(q6s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+ STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(7, 5, 27)
+ DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(27, 21, 11)
+ DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+ // generate 22,23,24,25
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(11, 13, 19)
+ DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(19, 29, 3)
+ DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+ // part of stage 2
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+ // part of stage 4
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q11s16 = vaddq_s16(q5s16, q0s16);
+ q12s16 = vaddq_s16(q6s16, q2s16);
+ q15s16 = vaddq_s16(q4s16, q3s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q11s16);
+ q9s16 = vaddq_s16(q13s16, q10s16);
+ q13s16 = vsubq_s16(q13s16, q10s16);
+ q11s16 = vsubq_s16(q14s16, q11s16);
+ STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+ LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+ q8s16 = vsubq_s16(q9s16, q12s16);
+ q10s16 = vaddq_s16(q14s16, q15s16);
+ q14s16 = vsubq_s16(q14s16, q15s16);
+ q12s16 = vaddq_s16(q9s16, q12s16);
+ STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+ q13s16 = q11s16;
+ q14s16 = q8s16;
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+ // part of stage 4
+ q14s16 = vsubq_s16(q5s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q2s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ q13s16 = vsubq_s16(q4s16, q3s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q1s16);
+ q9s16 = vaddq_s16(q13s16, q6s16);
+ q13s16 = vsubq_s16(q13s16, q6s16);
+ q1s16 = vsubq_s16(q14s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+ LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+ q14s16 = vsubq_s16(q8s16, q5s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q9s16, q0s16);
+ q0s16 = vsubq_s16(q9s16, q0s16);
+ STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+ DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
+ STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+ // -----------------------------------------
+      // BLOCK C: 8-11,12-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(3, 2, 30)
+ DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(30, 18, 14)
+ DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+ // part of stage 3
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+ // generate 10,11,12,13
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(14, 10, 22)
+ DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(22, 26, 6)
+ DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+ // part of stage 3
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+ // part of stage 5
+ q8s16 = vaddq_s16(q0s16, q5s16);
+ q9s16 = vaddq_s16(q1s16, q7s16);
+ q13s16 = vsubq_s16(q1s16, q7s16);
+ q14s16 = vsubq_s16(q3s16, q4s16);
+ q10s16 = vaddq_s16(q3s16, q4s16);
+ q15s16 = vaddq_s16(q2s16, q6s16);
+ STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+ STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+ // part of stage 6
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+ q13s16 = vsubq_s16(q0s16, q5s16);
+ q14s16 = vsubq_s16(q2s16, q6s16);
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ LOAD_FROM_TRANSPOSED(6, 4, 28)
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(28, 20, 12)
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+ // generate 0,1,2,3
+ // part of stage 4
+ LOAD_FROM_TRANSPOSED(12, 0, 16)
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(16, 8, 24)
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+ // part of stage 5
+ q4s16 = vaddq_s16(q7s16, q6s16);
+ q7s16 = vsubq_s16(q7s16, q6s16);
+ q6s16 = vsubq_s16(q5s16, q14s16);
+ q5s16 = vaddq_s16(q5s16, q14s16);
+ // part of stage 6
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q3s16);
+ q10s16 = vaddq_s16(q6s16, q1s16);
+ q11s16 = vaddq_s16(q7s16, q0s16);
+ q12s16 = vsubq_s16(q7s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q1s16);
+ q14s16 = vsubq_s16(q5s16, q3s16);
+ q15s16 = vsubq_s16(q4s16, q2s16);
+ // part of stage 7
+ LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+ q2s16 = vaddq_s16(q8s16, q1s16);
+ q3s16 = vaddq_s16(q9s16, q0s16);
+ q4s16 = vsubq_s16(q9s16, q0s16);
+ q5s16 = vsubq_s16(q8s16, q1s16);
+ LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16,
+ q15s16);
+ } else {
+ idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
+ q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
+ q14s16, q15s16);
+ dest += 8;
+ }
+ }
+ }
+ return;
+}
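The stage-by-stage comments in aom_idct32x32_1024_add_neon above all lean on
the DO_BUTTERFLY_STD macro, which is defined earlier in this file (outside
this hunk). As a reading aid, here is a scalar sketch of the rotation that
macro is understood to perform; the fixed-point form and the helper name are
assumptions for illustration, not part of the patch.

#include <stdint.h>

/* Assumed scalar form of the butterfly used throughout the stages above:
 * two inputs are rotated by a pair of cospi_N_64 constants (Q14 fixed
 * point) and each product sum is rounded and shifted right by 14, i.e.
 * dct_const_round_shift().  The authoritative definition is the NEON macro
 * earlier in the file. */
static void butterfly_sketch(int a, int b, int c0, int c1, int16_t *out0,
                             int16_t *out1) {
  const int rounding = 1 << 13; /* 1 << (DCT_CONST_BITS - 1) */
  *out0 = (int16_t)((a * c0 - b * c1 + rounding) >> 14);
  *out1 = (int16_t)((a * c1 + b * c0 + rounding) >> 14);
}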
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm
new file mode 100644
index 000000000..6bd733d5d
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm
@@ -0,0 +1,71 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+
+ EXPORT |aom_idct4x4_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|aom_idct4x4_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 4)
+ add r0, r0, #8 ; + (1 <<((4) - 1))
+ asr r0, r0, #4 ; >> 4
+
+ vdup.s16 q0, r0 ; duplicate a1
+
+ vld1.32 {d2[0]}, [r1], r2
+ vld1.32 {d2[1]}, [r1], r2
+ vld1.32 {d4[0]}, [r1], r2
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q8, q0, d2 ; dest[x] + a1
+ vaddw.u8 q9, q0, d4
+
+ vqmovun.s16 d6, q8 ; clip_pixel
+ vqmovun.s16 d7, q9
+
+ vst1.32 {d6[0]}, [r12], r2
+ vst1.32 {d6[1]}, [r12], r2
+ vst1.32 {d7[0]}, [r12], r2
+ vst1.32 {d7[1]}, [r12]
+
+ bx lr
+ ENDP ; |aom_idct4x4_1_add_neon|
+
+ END
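The arithmetic spelled out in the comments above boils down to a single DC
value added to every pixel of the 4x4 block. For reference, a scalar sketch
of that path under the assumption that the constants and rounding match the
comments (cospi_16_64 = 11585, Q14 rounding, then ROUND_POWER_OF_TWO(out, 4));
clip() and the function name are local stand-ins for illustration only.

#include <stdint.h>

static uint8_t clip(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void idct4x4_1_add_sketch(const int16_t *input, uint8_t *dest,
                                 int dest_stride) {
  const int cospi_16_64 = 11585;
  int out, a1, r, c;
  out = (input[0] * cospi_16_64 + (1 << 13)) >> 14; /* dct_const_round_shift */
  out = (out * cospi_16_64 + (1 << 13)) >> 14;      /* second rotation */
  a1 = (out + 8) >> 4;                              /* ROUND_POWER_OF_TWO(out, 4) */
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dest[r * dest_stride + c] = clip(dest[r * dest_stride + c] + a1);
}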
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 000000000..3df7a901b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d6u8;
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint16x8_t q8u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ q0s16 = vdupq_n_s16(a1);
+
+ // dc_only_idct_add
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+ d1 += dest_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
+ d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+ d2 += dest_stride;
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm
new file mode 100644
index 000000000..127acf614
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm
@@ -0,0 +1,193 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_idct4x4_16_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|aom_idct4x4_16_add_neon| PROC
+
+  ; The 2D transform is done with two passes which are actually pretty
+  ; similar. We first transform the rows. This is done by transposing
+  ; the inputs, doing an SIMD column transform (the columns are the
+  ; transposed rows) and then transposing the results (so that they go back
+  ; to normal/row positions). Then, we transform the columns by doing
+  ; another SIMD column transform.
+  ; So, two passes of a transpose followed by a column transform.
+
+ ; load the inputs into q8-q9, d16-d19
+ vld1.s16 {q8,q9}, [r0]!
+
+ ; generate scalar constants
+ ; cospi_8_64 = 15137 = 0x3b21
+ mov r0, #0x3b00
+ add r0, #0x21
+ ; cospi_16_64 = 11585 = 0x2d41
+ mov r3, #0x2d00
+ add r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+ mov r12, #0x1800
+ add r12, #0x7e
+
+ ; transpose the input data
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ ; generate constant vectors
+ vdup.16 d20, r0 ; replicate cospi_8_64
+ vdup.16 d21, r3 ; replicate cospi_16_64
+
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ vdup.16 d22, r12 ; replicate cospi_24_64
+
+ ; do the transform on transposed rows
+
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q13, d23, d21
+ vmull.s16 q14, d24, d21
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d26, q13, #14
+ vqrshrn.s32 d27, q14, #14
+ vqrshrn.s32 d29, q15, #14
+ vqrshrn.s32 d28, q1, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+ vswp d18, d19
+
+ ; transpose the results
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ ; do the transform on columns
+
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q13, d23, d21
+ vmull.s16 q14, d24, d21
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d26, q13, #14
+ vqrshrn.s32 d27, q14, #14
+ vqrshrn.s32 d29, q15, #14
+ vqrshrn.s32 d28, q1, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+
+ ; The results are in two registers, one of them being swapped. This will
+ ; be taken care of by loading the 'dest' value in a swapped fashion and
+ ; also storing them in the same swapped fashion.
+ ; temp_out[0, 1] = d16, d17 = q8
+ ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+ vld1.32 {d26[0]}, [r1], r2
+ vld1.32 {d26[1]}, [r1], r2
+ vld1.32 {d27[1]}, [r1], r2
+ vld1.32 {d27[0]}, [r1] ; no post-increment
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d26
+ vaddw.u8 q9, q9, d27
+
+ ; clip_pixel
+ vqmovun.s16 d26, q8
+ vqmovun.s16 d27, q9
+
+ ; do the stores in reverse order with negative post-increment, by changing
+ ; the sign of the stride
+ rsb r2, r2, #0
+ vst1.32 {d27[0]}, [r1], r2
+ vst1.32 {d27[1]}, [r1], r2
+ vst1.32 {d26[1]}, [r1], r2
+ vst1.32 {d26[0]}, [r1] ; no post-increment
+ bx lr
+ ENDP ; |aom_idct4x4_16_add_neon|
+
+ END
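The comment block at the top of aom_idct4x4_16_add_neon above describes the
flow: transpose, 1D transform of the transposed rows, transpose again, 1D
transform of the columns, then round, add to dest and clip. For reference, a
scalar sketch of one 4-point 1D pass exactly as the stage comments spell it
out; the constants are the ones generated in the routine and the helper name
is illustrative only.

#include <stdint.h>

static void idct4_1d_sketch(const int16_t *in, int16_t *out) {
  const int cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
  const int rounding = 1 << 13; /* dct_const_round_shift rounding term */
  int step0, step1, step2, step3;
  /* stage 1 */
  step0 = ((in[0] + in[2]) * cospi_16_64 + rounding) >> 14;
  step1 = ((in[0] - in[2]) * cospi_16_64 + rounding) >> 14;
  step2 = (in[1] * cospi_24_64 - in[3] * cospi_8_64 + rounding) >> 14;
  step3 = (in[1] * cospi_8_64 + in[3] * cospi_24_64 + rounding) >> 14;
  /* stage 2 */
  out[0] = (int16_t)(step0 + step3);
  out[1] = (int16_t)(step1 + step2);
  out[2] = (int16_t)(step1 - step2);
  out[3] = (int16_t)(step0 - step3);
}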
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 000000000..763be1ab0
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d26u8, d27u8;
+ uint32x2_t d26u32, d27u32;
+ uint16x8_t q8u16, q9u16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+ int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+ int16x8_t q8s16, q9s16, q13s16, q14s16;
+ int32x4_t q1s32, q13s32, q14s32, q15s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+ uint8_t *d;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ d20s16 = vdup_n_s16((int16_t)cospi_8_64);
+ d21s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ d22s16 = vdup_n_s16((int16_t)cospi_24_64);
+
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_high_s16(q9s16); // vswp d18 d19
+ d19s16 = vget_low_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ // do the transform on columns
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d = dest;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+ d += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ d = dest;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm
new file mode 100644
index 000000000..ec07e2053
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm
@@ -0,0 +1,91 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+
+ EXPORT |aom_idct8x8_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|aom_idct8x8_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 5)
+ add r0, r0, #16 ; + (1 <<((5) - 1))
+ asr r0, r0, #5 ; >> 5
+
+ vdup.s16 q0, r0 ; duplicate a1
+
+ ; load destination data
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r2
+ vld1.64 {d17}, [r1]
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ bx lr
+ ENDP ; |aom_idct8x8_1_add_neon|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 000000000..c7926f9e4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d5u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm
new file mode 100644
index 000000000..f3d5f246d
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm
@@ -0,0 +1,522 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_idct8x8_64_add_neon|
+ EXPORT |aom_idct8x8_12_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+  ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix, which
+  ; are loaded in q8-q15. The output will be stored back into q8-q15
+  ; registers. This macro will touch q0-q7 registers and use them as scratch
+  ; buffers during the calculation.
+ MACRO
+ IDCT8x8_1D
+ ; stage 1
+ vdup.16 d0, r3 ; duplicate cospi_28_64
+ vdup.16 d1, r4 ; duplicate cospi_4_64
+ vdup.16 d2, r5 ; duplicate cospi_12_64
+ vdup.16 d3, r6 ; duplicate cospi_20_64
+
+ ; input[1] * cospi_28_64
+ vmull.s16 q2, d18, d0
+ vmull.s16 q3, d19, d0
+
+ ; input[5] * cospi_12_64
+ vmull.s16 q5, d26, d2
+ vmull.s16 q6, d27, d2
+
+ ; input[1]*cospi_28_64-input[7]*cospi_4_64
+ vmlsl.s16 q2, d30, d1
+ vmlsl.s16 q3, d31, d1
+
+ ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+ vmlsl.s16 q5, d22, d3
+ vmlsl.s16 q6, d23, d3
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d8, q2, #14 ; >> 14
+ vqrshrn.s32 d9, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; input[1] * cospi_4_64
+ vmull.s16 q2, d18, d1
+ vmull.s16 q3, d19, d1
+
+ ; input[5] * cospi_20_64
+ vmull.s16 q9, d26, d3
+ vmull.s16 q13, d27, d3
+
+ ; input[1]*cospi_4_64+input[7]*cospi_28_64
+ vmlal.s16 q2, d30, d0
+ vmlal.s16 q3, d31, d0
+
+ ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+ vmlal.s16 q9, d22, d2
+ vmlal.s16 q13, d23, d2
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d14, q2, #14 ; >> 14
+ vqrshrn.s32 d15, q3, #14 ; >> 14
+
+ ; stage 2 & stage 3 - even half
+ vdup.16 d0, r7 ; duplicate cospi_16_64
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q13, #14 ; >> 14
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q2, d16, d0
+ vmull.s16 q3, d17, d0
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q13, d16, d0
+ vmull.s16 q15, d17, d0
+
+ ; (input[0] + input[2]) * cospi_16_64
+ vmlal.s16 q2, d24, d0
+ vmlal.s16 q3, d25, d0
+
+ ; (input[0] - input[2]) * cospi_16_64
+ vmlsl.s16 q13, d24, d0
+ vmlsl.s16 q15, d25, d0
+
+ vdup.16 d0, r8 ; duplicate cospi_24_64
+ vdup.16 d1, r9 ; duplicate cospi_8_64
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d18, q2, #14 ; >> 14
+ vqrshrn.s32 d19, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d22, q13, #14 ; >> 14
+ vqrshrn.s32 d23, q15, #14 ; >> 14
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+ ; input[1] * cospi_24_64
+ vmull.s16 q2, d20, d0
+ vmull.s16 q3, d21, d0
+
+ ; input[1] * cospi_8_64
+ vmull.s16 q8, d20, d1
+ vmull.s16 q12, d21, d1
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+ vmlsl.s16 q2, d28, d1
+ vmlsl.s16 q3, d29, d1
+
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+ vmlal.s16 q8, d28, d0
+ vmlal.s16 q12, d29, d0
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d26, q2, #14 ; >> 14
+ vqrshrn.s32 d27, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d30, q8, #14 ; >> 14
+ vqrshrn.s32 d31, q12, #14 ; >> 14
+
+ vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
+ vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
+ vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
+ vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
+
+  ; stage 3 - odd half
+ vdup.16 d16, r7 ; duplicate cospi_16_64
+
+ ; stage 2 - odd half
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
+ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q11, d28, d16
+ vmull.s16 q12, d29, d16
+
+ ; (step2[6] - step2[5]) * cospi_16_64
+ vmlsl.s16 q9, d26, d16
+ vmlsl.s16 q10, d27, d16
+
+ ; (step2[5] + step2[6]) * cospi_16_64
+ vmlal.s16 q11, d26, d16
+ vmlal.s16 q12, d27, d16
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q9, #14 ; >> 14
+ vqrshrn.s32 d11, q10, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+ MEND
+
+  ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|aom_idct8x8_64_add_neon| PROC
+ push {r4-r9}
+ vpush {d8-d15}
+ vld1.s16 {q8,q9}, [r0]!
+ vld1.s16 {q10,q11}, [r0]!
+ vld1.s16 {q12,q13}, [r0]!
+ vld1.s16 {q14,q15}, [r0]!
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+
+ ; First transform rows
+ IDCT8x8_1D
+
+ ; Transpose the matrix
+ TRANSPOSE8X8
+
+ ; Then transform columns
+ IDCT8x8_1D
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+ ENDP ; |aom_idct8x8_64_add_neon|
+
+;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|aom_idct8x8_12_add_neon| PROC
+ push {r4-r9}
+ vpush {d8-d15}
+ vld1.s16 {q8,q9}, [r0]!
+ vld1.s16 {q10,q11}, [r0]!
+ vld1.s16 {q12,q13}, [r0]!
+ vld1.s16 {q14,q15}, [r0]!
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+
+ ; First transform rows
+ ; stage 1
+ ; The following instructions use vqrdmulh to do the
+  ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh does a doubling
+  ; multiply and shifts the result by 16 bits instead of 14 bits, so we need
+  ; to double the constants before multiplying to compensate for this.
+ mov r12, r3, lsl #1
+ vdup.16 q0, r12 ; duplicate cospi_28_64*2
+ mov r12, r4, lsl #1
+ vdup.16 q1, r12 ; duplicate cospi_4_64*2
+
+ ; dct_const_round_shift(input[1] * cospi_28_64)
+ vqrdmulh.s16 q4, q9, q0
+
+ mov r12, r6, lsl #1
+ rsb r12, #0
+ vdup.16 q0, r12 ; duplicate -cospi_20_64*2
+
+ ; dct_const_round_shift(input[1] * cospi_4_64)
+ vqrdmulh.s16 q7, q9, q1
+
+ mov r12, r5, lsl #1
+ vdup.16 q1, r12 ; duplicate cospi_12_64*2
+
+ ; dct_const_round_shift(- input[3] * cospi_20_64)
+ vqrdmulh.s16 q5, q11, q0
+
+ mov r12, r7, lsl #1
+ vdup.16 q0, r12 ; duplicate cospi_16_64*2
+
+ ; dct_const_round_shift(input[3] * cospi_12_64)
+ vqrdmulh.s16 q6, q11, q1
+
+ ; stage 2 & stage 3 - even half
+ mov r12, r8, lsl #1
+ vdup.16 q1, r12 ; duplicate cospi_24_64*2
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrdmulh.s16 q9, q8, q0
+
+ mov r12, r9, lsl #1
+ vdup.16 q0, r12 ; duplicate cospi_8_64*2
+
+ ; dct_const_round_shift(input[1] * cospi_24_64)
+ vqrdmulh.s16 q13, q10, q1
+
+ ; dct_const_round_shift(input[1] * cospi_8_64)
+ vqrdmulh.s16 q15, q10, q0
+
+  ; stage 3 - odd half
+ vdup.16 d16, r7 ; duplicate cospi_16_64
+
+ vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
+ vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2]
+ vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2]
+ vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
+
+ ; stage 2 - odd half
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
+ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q11, d28, d16
+ vmull.s16 q12, d29, d16
+
+ ; (step2[6] - step2[5]) * cospi_16_64
+ vmlsl.s16 q9, d26, d16
+ vmlsl.s16 q10, d27, d16
+
+ ; (step2[5] + step2[6]) * cospi_16_64
+ vmlal.s16 q11, d26, d16
+ vmlal.s16 q12, d27, d16
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q9, #14 ; >> 14
+ vqrshrn.s32 d11, q10, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+
+ ; Transpose the matrix
+ TRANSPOSE8X8
+
+ ; Then transform columns
+ IDCT8x8_1D
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+ ENDP ; |aom_idct8x8_12_add_neon|
+
+ END
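aom_idct8x8_12_add_neon above explains why its first-pass constants are
doubled before vqrdmulh. Below is a small intrinsics sketch of the
equivalence that comment relies on; the function names are illustrative only
and the equality holds up to saturation, assuming 2*c still fits in an
int16_t.

#include <arm_neon.h>

/* dct_const_round_shift(x * c) done the long way: widen, multiply, then
 * round and narrow with a shift of 14. */
static int16x8_t mul_round_shift14_reference(int16x8_t x, int16_t c) {
  const int32x4_t lo = vmull_s16(vget_low_s16(x), vdup_n_s16(c));
  const int32x4_t hi = vmull_s16(vget_high_s16(x), vdup_n_s16(c));
  return vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14));
}

/* Same result via vqrdmulh: it computes (x * k * 2 + (1 << 15)) >> 16 with
 * saturation, so passing k = 2 * c reproduces the rounded >> 14 above. */
static int16x8_t mul_round_shift14_doubled(int16x8_t x, int16_t c) {
  return vqrdmulhq_s16(x, vdupq_n_s16((int16_t)(c * 2)));
}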
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 000000000..8ad70862d
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+ d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+ d3s16 = vdup_n_s16((int16_t)cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16((int16_t)cospi_24_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
+void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
+
+void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // First transform rows
+ // stage 1
+ q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+ q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
+
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
+
+ q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+ q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
+
+ q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+ // stage 2 & stage 3 - even half
+ q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
+
+ q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+ q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
+
+ q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+ q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+  // stage 3 - odd half
+ q0s16 = vaddq_s16(q9s16, q15s16);
+ q1s16 = vaddq_s16(q9s16, q13s16);
+ q2s16 = vsubq_s16(q9s16, q13s16);
+ q3s16 = vsubq_s16(q9s16, q15s16);
+
+ // stage 2 - odd half
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d16s16 = vdup_n_s16((int16_t)cospi_16_64);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ q8s16 = vaddq_s16(q0s16, q7s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q7s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
new file mode 100644
index 000000000..2dc5b2e56
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_top = vcombine_u16(p1, p1);
+ }
+
+ if (do_left) {
+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_left = vcombine_u16(p1, p1);
+ }
+
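+  // Round-average the available border samples: 8 samples when both edges are
+  // present, 4 for a single edge, or fall back to 128 when neither is available.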
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 3);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 2);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 2);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+ }
+ }
+}
+
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_top = vcombine_u16(p2, p2);
+ }
+
+ if (do_left) {
+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_left = vcombine_u16(p2, p2);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 4);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 3);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 3);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
+ }
+ }
+}
+
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_8x8(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ dc_8x8(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ dc_8x8(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_8x8(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x16_t A = vld1q_u8(above); // top row
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_top = vcombine_u16(p3, p3);
+ }
+
+ if (do_left) {
+    const uint8x16_t L = vld1q_u8(left);  // left column
+ const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_left = vcombine_u16(p3, p3);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 5);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 4);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 4);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ }
+ }
+}
+
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_16x16(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ dc_16x16(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ dc_16x16(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_16x16(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x16_t A0 = vld1q_u8(above); // top row
+ const uint8x16_t A1 = vld1q_u8(above + 16);
+ const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
+ const uint16x8_t p1 = vpaddlq_u8(A1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_top = vcombine_u16(p5, p5);
+ }
+
+ if (do_left) {
+    const uint8x16_t L0 = vld1q_u8(left);  // left column
+ const uint8x16_t L1 = vld1q_u8(left + 16);
+ const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
+ const uint16x8_t p1 = vpaddlq_u8(L1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_left = vcombine_u16(p5, p5);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 6);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 5);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 5);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 32; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ }
+ }
+}
+
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
+void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
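+  // Pack the reversed left samples and the above samples into one vector, then
+  // apply a 3-tap (1, 2, 1) / 4 smoothing filter along the 135-degree diagonal.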
+ const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+ const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+ const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint32x2_t zero = vdup_n_u32(0);
+ const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+ const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+ const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+ const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+ const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+ const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+ const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+ const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+ const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
+#if !HAVE_NEON_ASM
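+// The intrinsics versions below are compiled only when the hand-written
+// assembly implementations (HAVE_NEON_ASM) are not being used.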
+
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
+
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+}
+
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
+
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+}
+
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+}
+
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
+}
+
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
+
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+}
+
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
+
+ d1u64 = vld1_u64((const uint64_t *)left);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+}
+
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ }
+}
+
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ for (k = 0; k < 2; k++, left += 16) {
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ }
+ }
+}
+
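+// TM (TrueMotion) prediction: pred[r][c] = clip(left[r] + above[c] - above[-1]).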
+void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint16x8_t q1u16, q3u16;
+ int16x8_t q1s16;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+
+ d0u8 = vld1_dup_u8(above - 1);
+ d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+ q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+ for (i = 0; i < 4; i++, dst += stride) {
+ q1u16 = vdupq_n_u16((uint16_t)left[i]);
+ q1s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
+ d0u8 = vqmovun_s16(q1s16);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ }
+}
+
+void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint16x8_t q0u16, q3u16, q10u16;
+ int16x8_t q0s16;
+ uint16x4_t d20u16;
+ uint8x8_t d0u8, d2u8, d30u8;
+
+ d0u8 = vld1_dup_u8(above - 1);
+ d30u8 = vld1_u8(left);
+ d2u8 = vld1_u8(above);
+ q10u16 = vmovl_u8(d30u8);
+ q3u16 = vsubl_u8(d2u8, d0u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ q0u16 = vdupq_lane_u16(d20u16, 1);
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ q0u16 = vdupq_lane_u16(d20u16, 3);
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ }
+}
+
+void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+ uint8x16_t q0u8, q1u8;
+ int16x8_t q0s16, q1s16, q8s16, q11s16;
+ uint16x4_t d20u16;
+ uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+ q0u8 = vld1q_dup_u8(above - 1);
+ q1u8 = vld1q_u8(above);
+ q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ for (k = 0; k < 2; k++, left += 8) {
+ d18u8 = vld1_u8(left);
+ q10u16 = vmovl_u8(d18u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q8u16 = vdupq_lane_u16(d20u16, 1);
+ q1s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+ q11s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+ q8s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q8u16 = vdupq_lane_u16(d20u16, 3);
+ q1s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+ q0s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+ q11s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+ q8s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += stride;
+ }
+ }
+}
+
+void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+ uint8x16_t q0u8, q1u8, q2u8;
+ int16x8_t q12s16, q13s16, q14s16, q15s16;
+ uint16x4_t d6u16;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+ q0u8 = vld1q_dup_u8(above - 1);
+ q1u8 = vld1q_u8(above);
+ q2u8 = vld1q_u8(above + 16);
+ q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+ for (k = 0; k < 4; k++, left += 8) {
+ d26u8 = vld1_u8(left);
+ q3u16 = vmovl_u8(d26u8);
+ d6u16 = vget_low_u16(q3u16);
+ for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+ q0u16 = vdupq_lane_u16(d6u16, 0);
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 1);
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 2);
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 3);
+ q12s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+ q13s16 =
+ vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+ }
+ }
+}
+#endif // !HAVE_NEON_ASM
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
new file mode 100644
index 000000000..7d04d3553
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
@@ -0,0 +1,633 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_v_predictor_4x4_neon|
+ EXPORT |aom_v_predictor_8x8_neon|
+ EXPORT |aom_v_predictor_16x16_neon|
+ EXPORT |aom_v_predictor_32x32_neon|
+ EXPORT |aom_h_predictor_4x4_neon|
+ EXPORT |aom_h_predictor_8x8_neon|
+ EXPORT |aom_h_predictor_16x16_neon|
+ EXPORT |aom_h_predictor_32x32_neon|
+ EXPORT |aom_tm_predictor_4x4_neon|
+ EXPORT |aom_tm_predictor_8x8_neon|
+ EXPORT |aom_tm_predictor_16x16_neon|
+ EXPORT |aom_tm_predictor_32x32_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_v_predictor_4x4_neon| PROC
+ vld1.32 {d0[0]}, [r2]
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |aom_v_predictor_4x4_neon|
+
+;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_v_predictor_8x8_neon| PROC
+ vld1.8 {d0}, [r2]
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ bx lr
+ ENDP ; |aom_v_predictor_8x8_neon|
+
+;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_v_predictor_16x16_neon| PROC
+ vld1.8 {q0}, [r2]
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |aom_v_predictor_16x16_neon|
+
+;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_v_predictor_32x32_neon| PROC
+ vld1.8 {q0, q1}, [r2]
+ mov r2, #2
+loop_v
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_v
+ bx lr
+ ENDP ; |aom_v_predictor_32x32_neon|
+
+;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_h_predictor_4x4_neon| PROC
+ vld1.32 {d1[0]}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |aom_h_predictor_4x4_neon|
+
+;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_h_predictor_8x8_neon| PROC
+ vld1.64 {d1}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[4]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[5]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[6]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[7]
+ vst1.64 {d0}, [r0], r1
+ bx lr
+ ENDP ; |aom_h_predictor_8x8_neon|
+
+;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_h_predictor_16x16_neon| PROC
+ vld1.8 {q1}, [r3]
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |aom_h_predictor_16x16_neon|
+
+;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_h_predictor_32x32_neon| PROC
+ sub r1, r1, #16
+ mov r2, #2
+loop_h
+ vld1.8 {q1}, [r3]!
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_h
+ bx lr
+ ENDP ; |aom_h_predictor_32x32_neon|
+
+;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_tm_predictor_4x4_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.u8 {d0[]}, [r12]
+
+ ; Load above 4 pixels
+ vld1.32 {d2[0]}, [r2]
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]!
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+
+ ; 3rd row and 4th row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ bx lr
+ ENDP ; |aom_tm_predictor_4x4_neon|
+
+;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_tm_predictor_8x8_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; preload 8 left
+ vld1.8 {d30}, [r3]
+
+ ; Load above 8 pixels
+ vld1.64 {d2}, [r2]
+
+ vmovl.u8 q10, d30
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vdup.16 q0, d20[0]
+ vdup.16 q1, d20[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 3rd row and 4th row
+ vdup.16 q8, d20[2]
+ vdup.16 q9, d20[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ ; 5th row and 6th row
+ vdup.16 q0, d21[0]
+ vdup.16 q1, d21[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 7th row and 8th row
+ vdup.16 q8, d21[2]
+ vdup.16 q9, d21[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ bx lr
+ ENDP ; |aom_tm_predictor_8x8_neon|
+
+;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_tm_predictor_16x16_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+    ; Load above 16 pixels
+ vld1.8 {q1}, [r2]
+
+    ; preload the first 8 left pixels
+ vld1.8 {d18}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q2, d2, d0
+ vsubl.u8 q3, d3, d0
+
+ vmovl.u8 q10, d18
+
+ ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows per loop iteration; loop twice to cover all 16 rows.
+ mov r2, #2
+
+loop_16x16_neon
+ ; Process two rows.
+ vdup.16 q0, d20[0]
+ vdup.16 q8, d20[1]
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vdup.16 q0, d20[2] ; preload data for the next 2 rows
+ vdup.16 q8, d20[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vdup.16 q0, d21[0] ; preload data for the next 2 rows
+ vdup.16 q8, d21[1]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vdup.16 q0, d21[2] ; preload data for the next 2 rows
+ vdup.16 q8, d21[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vld1.8 {d18}, [r3]! ; preload the next 8 left pixels
+ vmovl.u8 q10, d18
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_16x16_neon
+
+ bx lr
+ ENDP ; |aom_tm_predictor_16x16_neon|
+
+;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|aom_tm_predictor_32x32_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; Load above 32 pixels
+ vld1.8 {q1}, [r2]!
+ vld1.8 {q2}, [r2]
+
+ ; preload 8 left pixels
+ vld1.8 {d26}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q8, d2, d0
+ vsubl.u8 q9, d3, d0
+ vsubl.u8 q10, d4, d0
+ vsubl.u8 q11, d5, d0
+
+ vmovl.u8 q3, d26
+
+ ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows per loop iteration; loop four times to cover all 32 rows.
+ mov r2, #4
+
+loop_32x32_neon
+ ; Process two rows.
+ vdup.16 q0, d6[0]
+ vdup.16 q2, d6[1]
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q1, d6[2]
+ vdup.16 q2, d6[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q1, q8
+ vadd.s16 q13, q1, q9
+ vadd.s16 q14, q1, q10
+ vadd.s16 q15, q1, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[0]
+ vdup.16 q2, d7[1]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[2]
+ vdup.16 q2, d7[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vld1.8 {d0}, [r3]! ; preload 8 left pixels
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vmovl.u8 q3, d0
+ vst1.64 {d24-d27}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_32x32_neon
+
+ bx lr
+ ENDP ; |aom_tm_predictor_32x32_neon|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm
new file mode 100644
index 000000000..b6e2c9edb
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm
@@ -0,0 +1,202 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_lpf_horizontal_4_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|aom_lpf_horizontal_4_dual_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ add r1, r1, r1 ; double pitch
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, r1, lsl #1 ; s[-4 * p]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ add r3, r2, r1, lsr #1 ; s[-3 * p]
+
+ vld1.u8 {q3}, [r2@64], r1 ; p3
+ vld1.u8 {q4}, [r3@64], r1 ; p2
+ vld1.u8 {q5}, [r2@64], r1 ; p1
+ vld1.u8 {q6}, [r3@64], r1 ; p0
+ vld1.u8 {q7}, [r2@64], r1 ; q0
+ vld1.u8 {q8}, [r3@64], r1 ; q1
+ vld1.u8 {q9}, [r2@64] ; q2
+ vld1.u8 {q10}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl aom_loop_filter_neon_16
+
+ vst1.u8 {q5}, [r2@64], r1 ; store op1
+ vst1.u8 {q6}, [r3@64], r1 ; store op0
+ vst1.u8 {q7}, [r2@64], r1 ; store oq0
+ vst1.u8 {q8}, [r3@64], r1 ; store oq1
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |aom_lpf_horizontal_4_dual_neon|
+
+; void aom_loop_filter_neon_16();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 PRESERVE
+; q0 blimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+;
+; Outputs:
+; q5 op1
+; q6 op0
+; q7 oq0
+; q8 oq1
+|aom_loop_filter_neon_16| PROC
+
+ ; filter_mask
+ vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 q11, q11, q12 ; m7 = max(m1, m2)
+ vmax.u8 q12, q13, q14 ; m8 = max(m3, m4)
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vmax.u8 q3, q3, q4 ; m9 = max(m5, m6)
+
+ vmov.u8 q10, #0x80
+
+ vmax.u8 q15, q11, q12 ; m10 = max(m7, m8)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3 ; m11 = max(m10, m9)
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ veor q7, q7, q10 ; qs0
+
+ vcge.u8 q15, q1, q15 ; abs(m11) > limit
+
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u16 q4, #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; a > blimit
+
+ vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; hev
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; filter &= hev
+ vand q15, q15, q9 ; mask
+
+ vmov.u8 q4, #3
+
+ vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0)
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; filter &= mask
+
+ vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3)
+ vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4)
+ vshr.s8 q2, q2, #3 ; filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2)
+ vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1
+
+ veor q7, q0, q10 ; *oq0 = u^0x80
+
+ vbic q1, q1, q14 ; filter &= ~hev
+
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter)
+
+ veor q6, q11, q10 ; *op0 = u^0x80
+ veor q5, q13, q10 ; *op1 = u^0x80
+ veor q8, q12, q10 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |aom_loop_filter_neon_16|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c
new file mode 100644
index 000000000..c0562a6ea
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
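+  // filter_mask: reduce the neighboring-sample absolute differences with max
+  // and compare the result against 'limit'.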
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // aom_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // aom_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vdupq_n_u16(3);
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q4 = vdupq_n_u8(3);
+ q9 = vdupq_n_u8(4);
+ // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+ return;
+}
+
+void aom_lpf_horizontal_4_dual_neon(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+ uint8x16_t qblimit, qlimit, qthresh;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+ dblimit0 = vld1_u8(blimit0);
+ dlimit0 = vld1_u8(limit0);
+ dthresh0 = vld1_u8(thresh0);
+ dblimit1 = vld1_u8(blimit1);
+ dlimit1 = vld1_u8(limit1);
+ dthresh1 = vld1_u8(thresh1);
+ qblimit = vcombine_u8(dblimit0, dblimit1);
+ qlimit = vcombine_u8(dlimit0, dlimit1);
+ qthresh = vcombine_u8(dthresh0, dthresh1);
+
+ s -= (p << 2);
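+  // Step back four rows so the eight loads below cover p3..q3 around the edge.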
+
+ q3u8 = vld1q_u8(s);
+ s += p;
+ q4u8 = vld1q_u8(s);
+ s += p;
+ q5u8 = vld1q_u8(s);
+ s += p;
+ q6u8 = vld1q_u8(s);
+ s += p;
+ q7u8 = vld1q_u8(s);
+ s += p;
+ q8u8 = vld1q_u8(s);
+ s += p;
+ q9u8 = vld1q_u8(s);
+ s += p;
+ q10u8 = vld1q_u8(s);
+
+ loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
+ q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
+
+ s -= (p * 5);
+ vst1q_u8(s, q5u8);
+ s += p;
+ vst1q_u8(s, q6u8);
+ s += p;
+ vst1q_u8(s, q7u8);
+ s += p;
+ vst1q_u8(s, q8u8);
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm
new file mode 100644
index 000000000..8b54984d5
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm
@@ -0,0 +1,252 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_lpf_horizontal_4_neon|
+ EXPORT |aom_lpf_vertical_4_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; This implementation currently processes 8 pixels at a time, while the aom
+; loop filter operates on 16 pixels at a time.
+;
+; void aom_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|aom_lpf_horizontal_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+    sub r2, r0, r1, lsl #1 ; move src pointer back by 4 rows (p3)
+    add r3, r2, r1, lsr #1 ; r3 points 3 rows back (p2)
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl aom_loop_filter_neon
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ pop {pc}
+ ENDP ; |aom_lpf_horizontal_4_neon|
+
+; This implementation currently processes 8 pixels at a time, while the aom
+; loop filter operates on 16 pixels at a time.
+;
+; void aom_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|aom_lpf_vertical_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+    sub r2, r0, #4 ; move s pointer back by 4 columns
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+    ; transpose the 8x8 block
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl aom_loop_filter_neon
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ pop {pc}
+ ENDP ; |aom_lpf_vertical_4_neon|
+
+; void aom_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|aom_loop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; abs(m1) > limit
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; a > blimit
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |aom_loop_filter_neon|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c
new file mode 100644
index 000000000..2b1f80b81
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+
+static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
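+  // filter_mask: absolute differences between neighboring pixels.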
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
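+  // Convert p1, p0, q0 and q1 to signed values by flipping the sign bit (XOR with 0x80).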
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
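+  // filter2 = clamp(filter + 3) >> 3, filter1 = clamp(filter + 4) >> 3.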
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
+}
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
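+  // The loop runs once: this kernel filters a single 8-pixel-wide edge per call.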
+ for (i = 0; i < 1; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
+}
+
+void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i, pitch8;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ pitch8 = pitch * 8;
+ for (i = 0; i < 1; i++, src += pitch8) {
+ s = src - (i + 1) * 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
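+    // Transpose the 8x8 block so the vertical edge can be filtered as rows.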
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+ d4Result.val[0] = d4u8;
+ d4Result.val[1] = d5u8;
+ d4Result.val[2] = d6u8;
+ d4Result.val[3] = d7u8;
+
+ src -= 2;
+ vst4_lane_u8(src, d4Result, 0);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 1);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 2);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 3);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 4);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 5);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 6);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 7);
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm
new file mode 100644
index 000000000..9f3db66ee
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm
@@ -0,0 +1,428 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_lpf_horizontal_8_neon|
+ EXPORT |aom_lpf_vertical_8_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently aom only works on 8 iterations at a time. The aom loop filter
+; works on 16 iterations at a time.
+;
+; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|aom_lpf_horizontal_8_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r2, [sp, #12] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r2, r3, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r3@64], r1 ; p3
+ vld1.u8 {d4}, [r2@64], r1 ; p2
+ vld1.u8 {d5}, [r3@64], r1 ; p1
+ vld1.u8 {d6}, [r2@64], r1 ; p0
+ vld1.u8 {d7}, [r3@64], r1 ; q0
+ vld1.u8 {d16}, [r2@64], r1 ; q1
+ vld1.u8 {d17}, [r3@64] ; q2
+ vld1.u8 {d18}, [r2@64], r1 ; q3
+
+ sub r3, r3, r1, lsl #1
+ sub r2, r2, r1, lsl #2
+
+ bl aom_mbloop_filter_neon
+
+ vst1.u8 {d0}, [r2@64], r1 ; store op2
+ vst1.u8 {d1}, [r3@64], r1 ; store op1
+ vst1.u8 {d2}, [r2@64], r1 ; store op0
+ vst1.u8 {d3}, [r3@64], r1 ; store oq0
+ vst1.u8 {d4}, [r2@64], r1 ; store oq1
+ vst1.u8 {d5}, [r3@64], r1 ; store oq2
+
+ pop {r4-r5, pc}
+
+ ENDP ; |aom_lpf_horizontal_8_neon|
+
+; void aom_lpf_vertical_8_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|aom_lpf_vertical_8_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #12] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+ ;transpose to 8x16 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ sub r2, r0, #3
+ add r3, r0, #1
+
+ bl aom_mbloop_filter_neon
+
+ ;store op2, op1, op0, oq0
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2]
+
+ ;store oq1, oq2
+ vst2.8 {d4[0], d5[0]}, [r3], r1
+ vst2.8 {d4[1], d5[1]}, [r3], r1
+ vst2.8 {d4[2], d5[2]}, [r3], r1
+ vst2.8 {d4[3], d5[3]}, [r3], r1
+ vst2.8 {d4[4], d5[4]}, [r3], r1
+ vst2.8 {d4[5], d5[5]}, [r3], r1
+ vst2.8 {d4[6], d5[6]}, [r3], r1
+ vst2.8 {d4[7], d5[7]}, [r3]
+
+ pop {r4-r5, pc}
+ ENDP ; |aom_lpf_vertical_8_neon|
+
+; void aom_mbloop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d0 op2
+; d1 op1
+; d2 op0
+; d3 oq0
+; d4 oq1
+; d5 oq2
+|aom_mbloop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2)
+
+ vmax.u8 d23, d23, d24 ; m3 = max(m5, m6)
+
+ vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2)
+
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0)
+ vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0)
+ vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d1, d19
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; m4 = max(m7, m8)
+ vmax.u8 d26, d27, d28 ; m5 = max(m10, m11)
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+
+ vmax.u8 d25, d25, d26 ; m4 = max(m4, m5)
+
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmax.u8 d20, d20, d25 ; m2 = max(m2, m4)
+
+ vmov.u8 d23, #1
+ vcge.u8 d24, d0, d24 ; a > blimit
+
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+
+ vcge.u8 d20, d23, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+
+ vand d20, d20, d19 ; flat & mask
+
+ vmov.u8 d22, #0x80
+
+ vorr d23, d21, d23 ; hev
+
+ ; This instruction will truncate the "flat & mask" masks down to 4 bits
+ ; each to fit into one 32-bit ARM register. The values are stored in
+ ; q10.64[0].
+ vshrn.u16 d30, q10, #4
+ vmov.u32 r4, d30[0] ; flat & mask 4bits
+
+ adds r5, r4, #1 ; Check for all 1's
+
+ ; If mask and flat are 1's for all vectors, then we only need to execute
+ ; the power branch for all vectors.
+ beq power_branch_only
+
+ cmp r4, #0 ; Check for 0, set flag for later
+
+ ; mbfilter() function
+ ; filter() function
+ ; convert to signed
+ veor d21, d7, d22 ; qs0
+ veor d24, d6, d22 ; ps0
+ veor d25, d5, d22 ; ps1
+ veor d26, d16, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d21, d24 ; ( qs0 - ps0)
+
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+
+ vand d29, d29, d23 ; filter &= hev
+
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d23 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ ; If mask and flat are 0's for all vectors, then we only need to execute
+ ; the filter branch for all vectors.
+ beq filter_branch_only
+
+ ; If mask and flat are mixed then we must perform both branches and
+ ; combine the data.
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d21, d21, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ ; At this point we have already executed the filter branch. The filter
+ ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+ ; branch and combine the data.
+ vmov.u8 d23, #2
+ vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3
+ vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2
+
+ vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask)
+
+ vaddw.u8 q14, d5 ; r_op2 += p1
+
+ vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask)
+
+ vqrshrn.u16 d30, q14, #3 ; r_op2
+
+ vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3
+ vsubw.u8 q14, d4 ; r_op1 -= p2
+ vaddw.u8 q14, d5 ; r_op1 += p1
+ vaddw.u8 q14, d16 ; r_op1 += q1
+
+ vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask)
+
+ vqrshrn.u16 d31, q14, #3 ; r_op1
+
+ vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3
+ vsubw.u8 q14, d5 ; r_op0 -= p1
+ vaddw.u8 q14, d6 ; r_op0 += p0
+ vaddw.u8 q14, d17 ; r_op0 += q2
+
+ vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask)
+
+ vqrshrn.u16 d23, q14, #3 ; r_op0
+
+ vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3
+ vsubw.u8 q14, d6 ; r_oq0 -= p0
+ vaddw.u8 q14, d7 ; r_oq0 += q0
+
+ vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask)
+
+ vaddw.u8 q14, d18 ; oq0 += q3
+
+ vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask)
+
+ vqrshrn.u16 d22, q14, #3 ; r_oq0
+
+ vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2
+ vsubw.u8 q14, d7 ; r_oq1 -= q0
+ vaddw.u8 q14, d16 ; r_oq1 += q1
+
+ vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask)
+
+ vaddw.u8 q14, d18 ; r_oq1 += q3
+
+ vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask)
+
+ vqrshrn.u16 d6, q14, #3 ; r_oq1
+
+ vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1
+ vsubw.u8 q14, d16 ; r_oq2 -= q1
+ vaddw.u8 q14, d17 ; r_oq2 += q2
+ vaddw.u8 q14, d18 ; r_oq2 += q3
+
+ vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask)
+
+ vqrshrn.u16 d7, q14, #3 ; r_oq2
+
+ vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask)
+ vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask)
+ vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask)
+
+ bx lr
+
+power_branch_only
+ vmov.u8 d27, #3
+ vmov.u8 d21, #2
+ vaddl.u8 q14, d6, d7 ; op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; op2 += p3 * 3
+ vmlal.u8 q14, d4, d21 ; op2 += p2 * 2
+ vaddw.u8 q14, d5 ; op2 += p1
+ vqrshrn.u16 d0, q14, #3 ; op2
+
+ vsubw.u8 q14, d3 ; op1 = op2 - p3
+ vsubw.u8 q14, d4 ; op1 -= p2
+ vaddw.u8 q14, d5 ; op1 += p1
+ vaddw.u8 q14, d16 ; op1 += q1
+ vqrshrn.u16 d1, q14, #3 ; op1
+
+ vsubw.u8 q14, d3 ; op0 = op1 - p3
+ vsubw.u8 q14, d5 ; op0 -= p1
+ vaddw.u8 q14, d6 ; op0 += p0
+ vaddw.u8 q14, d17 ; op0 += q2
+ vqrshrn.u16 d2, q14, #3 ; op0
+
+ vsubw.u8 q14, d3 ; oq0 = op0 - p3
+ vsubw.u8 q14, d6 ; oq0 -= p0
+ vaddw.u8 q14, d7 ; oq0 += q0
+ vaddw.u8 q14, d18 ; oq0 += q3
+ vqrshrn.u16 d3, q14, #3 ; oq0
+
+ vsubw.u8 q14, d4 ; oq1 = oq0 - p2
+ vsubw.u8 q14, d7 ; oq1 -= q0
+ vaddw.u8 q14, d16 ; oq1 += q1
+ vaddw.u8 q14, d18 ; oq1 += q3
+ vqrshrn.u16 d4, q14, #3 ; oq1
+
+ vsubw.u8 q14, d5 ; oq2 = oq1 - p1
+ vsubw.u8 q14, d16 ; oq2 -= q1
+ vaddw.u8 q14, d17 ; oq2 += q2
+ vaddw.u8 q14, d18 ; oq2 += q3
+ vqrshrn.u16 d5, q14, #3 ; oq2
+
+ bx lr
+
+filter_branch_only
+ ; TODO(fgalligan): See if we can rearrange registers so we do not need to
+ ; do the 2 vswp.
+ vswp d0, d4 ; op2
+ vswp d5, d17 ; oq2
+ veor d2, d24, d22 ; *op0 = u^0x80
+ veor d3, d21, d22 ; *oq0 = u^0x80
+ veor d1, d25, d22 ; *op1 = u^0x80
+ veor d4, d26, d22 ; *oq1 = u^0x80
+
+ bx lr
+
+ ENDP ; |aom_mbloop_filter_neon|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c
new file mode 100644
index 000000000..c4502fdb5
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+
+static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p2
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q2
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+
+ d25u8 = vabd_u8(d6u8, d4u8);
+
+ d23u8 = vmax_u8(d23u8, d24u8);
+
+ d26u8 = vabd_u8(d7u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
+
+ d19u8 = vmax_u8(d19u8, d23u8);
+
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+
+ d19u8 = vcge_u8(dlimit, d19u8);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+
+ d23u8 = vshr_n_u8(d23u8, 1);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+
+ d24u8 = vqadd_u8(d24u8, d23u8);
+
+ d20u8 = vmax_u8(d20u8, d25u8);
+
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+
+ d20u8 = vcge_u8(d23u8, d20u8);
+
+ d19u8 = vand_u8(d19u8, d24u8);
+
+ d23u8 = vcgt_u8(d22u8, dthresh);
+
+ d20u8 = vand_u8(d20u8, d19u8);
+
+ d22u8 = vdup_n_u8(0x80);
+
+ d23u8 = vorr_u8(d21u8, d23u8);
+
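+  // Narrow the flat & mask bytes to 4 bits per lane so the result fits in one 32-bit scalar.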
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
+
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
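+    // Filter path: convert to signed and compute the clamped filter values (filter1/filter2).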
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
+
+ d27u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ q15s16 = vaddw_s8(q15s16, d29s8);
+
+ d29u8 = vdup_n_u8(4);
+
+ d28s8 = vqmovn_s16(q15s16);
+
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
+
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
+
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+ q14u16 = vaddw_u8(q14u16, d5u8);
+
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
+}
+
+void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < 1; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+ &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
+}
+
+void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+ uint8x8x2_t d2Result;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ for (i = 0; i < 1; i++) {
+ s = src + (i * (pitch << 3)) - 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
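+    // Transpose the 8x8 block so the vertical edge can be filtered as rows.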
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+ d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+ &d5u8);
+
+ d4Result.val[0] = d0u8;
+ d4Result.val[1] = d1u8;
+ d4Result.val[2] = d2u8;
+ d4Result.val[3] = d3u8;
+
+ d2Result.val[0] = d4u8;
+ d2Result.val[1] = d5u8;
+
+ s = src - 3;
+ vst4_lane_u8(s, d4Result, 0);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 1);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 2);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 3);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 4);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 5);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 6);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 7);
+
+ s = src + 1;
+ vst2_lane_u8(s, d2Result, 0);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 1);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 2);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 3);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 4);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 5);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 6);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 7);
+ }
+ return;
+}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm
new file mode 100644
index 000000000..675928860
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm
@@ -0,0 +1,638 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+ EXPORT |aom_lpf_horizontal_edge_8_neon|
+ EXPORT |aom_lpf_horizontal_edge_16_neon|
+ EXPORT |aom_lpf_vertical_16_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void mb_lpf_horizontal_edge(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; r12 int count
+|mb_lpf_horizontal_edge| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
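+; Each iteration filters an 8-pixel-wide group; r12 holds the remaining group count.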
+h_count
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d0}, [r8@64], r1 ; p7
+ vld1.u8 {d1}, [r8@64], r1 ; p6
+ vld1.u8 {d2}, [r8@64], r1 ; p5
+ vld1.u8 {d3}, [r8@64], r1 ; p4
+ vld1.u8 {d4}, [r8@64], r1 ; p3
+ vld1.u8 {d5}, [r8@64], r1 ; p2
+ vld1.u8 {d6}, [r8@64], r1 ; p1
+ vld1.u8 {d7}, [r8@64], r1 ; p0
+ vld1.u8 {d8}, [r8@64], r1 ; q0
+ vld1.u8 {d9}, [r8@64], r1 ; q1
+ vld1.u8 {d10}, [r8@64], r1 ; q2
+ vld1.u8 {d11}, [r8@64], r1 ; q3
+ vld1.u8 {d12}, [r8@64], r1 ; q4
+ vld1.u8 {d13}, [r8@64], r1 ; q5
+ vld1.u8 {d14}, [r8@64], r1 ; q6
+ vld1.u8 {d15}, [r8@64], r1 ; q7
+
+ bl aom_wide_mbfilter_neon
+
+ tst r7, #1
+ beq h_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, r1, lsl #1
+
+ vst1.u8 {d25}, [r8@64], r1 ; store op1
+ vst1.u8 {d24}, [r8@64], r1 ; store op0
+ vst1.u8 {d23}, [r8@64], r1 ; store oq0
+ vst1.u8 {d26}, [r8@64], r1 ; store oq1
+
+ b h_next
+
+h_mbfilter
+ tst r7, #2
+ beq h_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, r1, lsl #1
+ sub r8, r8, r1
+
+ vst1.u8 {d18}, [r8@64], r1 ; store op2
+ vst1.u8 {d19}, [r8@64], r1 ; store op1
+ vst1.u8 {d20}, [r8@64], r1 ; store op0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq0
+ vst1.u8 {d22}, [r8@64], r1 ; store oq1
+ vst1.u8 {d23}, [r8@64], r1 ; store oq2
+
+ b h_next
+
+h_wide_mbfilter
+ sub r8, r0, r1, lsl #3
+ add r8, r8, r1
+
+ vst1.u8 {d16}, [r8@64], r1 ; store op6
+ vst1.u8 {d24}, [r8@64], r1 ; store op5
+ vst1.u8 {d25}, [r8@64], r1 ; store op4
+ vst1.u8 {d26}, [r8@64], r1 ; store op3
+ vst1.u8 {d27}, [r8@64], r1 ; store op2
+ vst1.u8 {d18}, [r8@64], r1 ; store op1
+ vst1.u8 {d19}, [r8@64], r1 ; store op0
+ vst1.u8 {d20}, [r8@64], r1 ; store oq0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq1
+ vst1.u8 {d22}, [r8@64], r1 ; store oq2
+ vst1.u8 {d23}, [r8@64], r1 ; store oq3
+ vst1.u8 {d1}, [r8@64], r1 ; store oq4
+ vst1.u8 {d2}, [r8@64], r1 ; store oq5
+ vst1.u8 {d3}, [r8@64], r1 ; store oq6
+
+h_next
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne h_count
+
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |mb_lpf_horizontal_edge|
+
+; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|aom_lpf_horizontal_edge_8_neon| PROC
+ mov r12, #1
+ b mb_lpf_horizontal_edge
+ ENDP ; |aom_lpf_horizontal_edge_8_neon|
+
+; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|aom_lpf_horizontal_edge_16_neon| PROC
+ mov r12, #2
+ b mb_lpf_horizontal_edge
+ ENDP ; |aom_lpf_horizontal_edge_16_neon|
+
+; void aom_lpf_vertical_16_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|aom_lpf_vertical_16_neon| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, #8
+
+ vld1.8 {d0}, [r8@64], r1
+ vld1.8 {d8}, [r0@64], r1
+ vld1.8 {d1}, [r8@64], r1
+ vld1.8 {d9}, [r0@64], r1
+ vld1.8 {d2}, [r8@64], r1
+ vld1.8 {d10}, [r0@64], r1
+ vld1.8 {d3}, [r8@64], r1
+ vld1.8 {d11}, [r0@64], r1
+ vld1.8 {d4}, [r8@64], r1
+ vld1.8 {d12}, [r0@64], r1
+ vld1.8 {d5}, [r8@64], r1
+ vld1.8 {d13}, [r0@64], r1
+ vld1.8 {d6}, [r8@64], r1
+ vld1.8 {d14}, [r0@64], r1
+ vld1.8 {d7}, [r8@64], r1
+ vld1.8 {d15}, [r0@64], r1
+
+ sub r0, r0, r1, lsl #3
+
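+    ; transpose the two 8x8 halves so pixel columns become rows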
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+ vtrn.8 d4, d5
+ vtrn.8 d6, d7
+
+ vtrn.8 d8, d9
+ vtrn.8 d10, d11
+ vtrn.8 d12, d13
+ vtrn.8 d14, d15
+
+ bl aom_wide_mbfilter_neon
+
+ tst r7, #1
+ beq v_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, #2
+
+ vswp d23, d25
+
+ vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
+ vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+ vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+ vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+ vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+ vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+ vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+ vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
+
+ b v_end
+
+v_mbfilter
+ tst r7, #2
+ beq v_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, #3
+
+ vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
+ vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
+ vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
+ vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
+ vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
+ vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
+ vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
+ vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
+ vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
+ vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
+ vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
+ vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
+ vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
+ vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
+ vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
+ vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
+
+ b v_end
+
+v_wide_mbfilter
+ sub r8, r0, #8
+
+ vtrn.32 d0, d26
+ vtrn.32 d16, d27
+ vtrn.32 d24, d18
+ vtrn.32 d25, d19
+
+ vtrn.16 d0, d24
+ vtrn.16 d16, d25
+ vtrn.16 d26, d18
+ vtrn.16 d27, d19
+
+ vtrn.8 d0, d16
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+ vtrn.8 d18, d19
+
+ vtrn.32 d20, d1
+ vtrn.32 d21, d2
+ vtrn.32 d22, d3
+ vtrn.32 d23, d15
+
+ vtrn.16 d20, d22
+ vtrn.16 d21, d23
+ vtrn.16 d1, d3
+ vtrn.16 d2, d15
+
+ vtrn.8 d20, d21
+ vtrn.8 d22, d23
+ vtrn.8 d1, d2
+ vtrn.8 d3, d15
+
+ vst1.8 {d0}, [r8@64], r1
+ vst1.8 {d20}, [r0@64], r1
+ vst1.8 {d16}, [r8@64], r1
+ vst1.8 {d21}, [r0@64], r1
+ vst1.8 {d24}, [r8@64], r1
+ vst1.8 {d22}, [r0@64], r1
+ vst1.8 {d25}, [r8@64], r1
+ vst1.8 {d23}, [r0@64], r1
+ vst1.8 {d26}, [r8@64], r1
+ vst1.8 {d1}, [r0@64], r1
+ vst1.8 {d27}, [r8@64], r1
+ vst1.8 {d2}, [r0@64], r1
+ vst1.8 {d18}, [r8@64], r1
+ vst1.8 {d3}, [r0@64], r1
+ vst1.8 {d19}, [r8@64], r1
+ vst1.8 {d15}, [r0@64], r1
+
+v_end
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |aom_lpf_vertical_16_neon|
+
+; void aom_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16 blimit
+; d17 limit
+; d18 thresh
+; d0 p7
+; d1 p6
+; d2 p5
+; d3 p4
+; d4 p3
+; d5 p2
+; d6 p1
+; d7 p0
+; d8 q0
+; d9 q1
+; d10 q2
+; d11 q3
+; d12 q4
+; d13 q5
+; d14 q6
+; d15 q7
+|aom_wide_mbfilter_neon| PROC
+ mov r7, #0
+
+ ; filter_mask
+ vabd.u8 d19, d4, d5 ; abs(p3 - p2)
+ vabd.u8 d20, d5, d6 ; abs(p2 - p1)
+ vabd.u8 d21, d6, d7 ; abs(p1 - p0)
+ vabd.u8 d22, d9, d8 ; abs(q1 - q0)
+ vabd.u8 d23, d10, d9 ; abs(q2 - q1)
+ vabd.u8 d24, d11, d10 ; abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
+ vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
+ vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d7, d8 ; abs(p0 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d6, d9 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d17, d19
+
+ ; flatmask4
+ vabd.u8 d25, d7, d5 ; abs(p0 - p2)
+ vabd.u8 d26, d8, d10 ; abs(q0 - q2)
+ vabd.u8 d27, d4, d7 ; abs(p3 - p0)
+ vabd.u8 d28, d11, d8 ; abs(q3 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
+ vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
+ vmax.u8 d25, d25, d26
+ vmax.u8 d20, d20, d25
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmov.u8 d30, #1
+ vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1
+
+ vcge.u8 d20, d30, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ ; hevmask
+ vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1
+ vorr d21, d21, d22 ; hev
+
+ vand d16, d20, d19 ; flat && mask
+ vmov r5, r6, d16
+
+ ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+ vabd.u8 d22, d3, d7 ; abs(p4 - p0)
+ vabd.u8 d23, d12, d8 ; abs(q4 - q0)
+ vabd.u8 d24, d7, d2 ; abs(p0 - p5)
+ vabd.u8 d25, d8, d13 ; abs(q0 - q5)
+ vabd.u8 d26, d1, d7 ; abs(p6 - p0)
+ vabd.u8 d27, d14, d8 ; abs(q6 - q0)
+ vabd.u8 d28, d0, d7 ; abs(p7 - p0)
+ vabd.u8 d29, d15, d8 ; abs(q7 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0))
+ vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5))
+ vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0))
+ vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0))
+
+ vmax.u8 d26, d22, d23
+ vmax.u8 d27, d24, d25
+ vmax.u8 d23, d26, d27
+
+ vcge.u8 d18, d30, d23 ; flat2
+
+ vmov.u8 d22, #0x80
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #1 ; Only do filter branch
+
+ vand d17, d18, d16 ; flat2 && flat && mask
+ vmov r5, r6, d17
+
+ ; mbfilter() function
+
+ ; filter() function
+ ; convert to signed
+ veor d23, d8, d22 ; qs0
+ veor d24, d7, d22 ; ps0
+ veor d25, d6, d22 ; ps1
+ veor d26, d9, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+ vand d29, d29, d21 ; filter &= hev
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d21 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d23, d23, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ tst r7, #1
+ bxne lr
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #2 ; Only do mbfilter branch
+
+ ; mbfilter flat && mask branch
+ ; TODO(fgalligan): Can I decrease the cycles by shifting to consecutive d's
+ ; and using vbit on the q's?
+ vmov.u8 d29, #2
+ vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
+ vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
+ vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+ vaddl.u8 q10, d4, d5
+ vaddw.u8 q15, d6 ; op2 = p1 + p0 + q0 + p3 * 3 + p2 * 2
+ vaddl.u8 q14, d6, d9
+ vqrshrn.u16 d18, q15, #3 ; r_op2
+
+ vsub.i16 q15, q10
+ vaddl.u8 q10, d4, d6
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d7, d10
+ vqrshrn.u16 d19, q15, #3 ; r_op1
+
+ vsub.i16 q15, q10
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d8, d11
+ vqrshrn.u16 d20, q15, #3 ; r_op0
+
+ vsubw.u8 q15, d4 ; oq0 = op0 - p3
+ vsubw.u8 q15, d7 ; oq0 -= p0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d9, d11
+ vqrshrn.u16 d21, q15, #3 ; r_oq0
+
+ vsubw.u8 q15, d5 ; oq1 = oq0 - p2
+ vsubw.u8 q15, d8 ; oq1 -= q0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d10, d11
+ vqrshrn.u16 d22, q15, #3 ; r_oq1
+
+ vsubw.u8 q15, d6 ; oq2 = oq0 - p1
+ vsubw.u8 q15, d9 ; oq2 -= q1
+ vadd.i16 q15, q14
+ vqrshrn.u16 d27, q15, #3 ; r_oq2
+
+ ; Filter does not set op2 or oq2, so use p2 and q2.
+ vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask)
+ vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask)
+ vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask)
+ vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask)
+ vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+ vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask)
+ vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask)
+
+ tst r7, #2
+ bxne lr
+
+ ; wide_mbfilter flat2 && flat && mask branch
+ vmov.u8 d16, #7
+ vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
+ vaddl.u8 q12, d2, d3
+ vaddl.u8 q13, d4, d5
+ vaddl.u8 q14, d1, d6
+ vmlal.u8 q15, d0, d16 ; op6 += p7 * 3
+ vadd.i16 q12, q13
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vadd.i16 q15, q12
+ vaddl.u8 q12, d0, d1
+ vaddw.u8 q15, d1
+ vaddl.u8 q13, d0, d2
+ vadd.i16 q14, q15, q14
+ vqrshrn.u16 d16, q15, #4 ; w_op6
+
+ vsub.i16 q15, q14, q12
+ vaddl.u8 q14, d3, d10
+ vqrshrn.u16 d24, q15, #4 ; w_op5
+
+ vsub.i16 q15, q13
+ vaddl.u8 q13, d0, d3
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vqrshrn.u16 d25, q15, #4 ; w_op4
+
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d0, d4
+ vsub.i16 q15, q13
+ vsub.i16 q14, q15, q14
+ vqrshrn.u16 d26, q15, #4 ; w_op3
+
+ vaddw.u8 q15, q14, d5 ; op2 += p2
+ vaddl.u8 q14, d0, d5
+ vaddw.u8 q15, d12 ; op2 += q4
+ vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
+ vqrshrn.u16 d27, q15, #4 ; w_op2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d6
+ vaddw.u8 q15, d6 ; op1 += p1
+ vaddw.u8 q15, d13 ; op1 += q5
+ vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
+ vqrshrn.u16 d18, q15, #4 ; w_op1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d7
+ vaddw.u8 q15, d7 ; op0 += p0
+ vaddw.u8 q15, d14 ; op0 += q6
+ vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
+ vqrshrn.u16 d19, q15, #4 ; w_op0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d1, d8
+ vaddw.u8 q15, d8 ; oq0 += q0
+ vaddw.u8 q15, d15 ; oq0 += q7
+ vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
+ vqrshrn.u16 d20, q15, #4 ; w_oq0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vaddw.u8 q15, d9 ; oq1 += q1
+ vaddl.u8 q4, d10, d15
+ vaddw.u8 q15, d15 ; oq1 += q7
+ vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
+ vqrshrn.u16 d21, q15, #4 ; w_oq1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d3, d10
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d11, d15
+ vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
+ vqrshrn.u16 d22, q15, #4 ; w_oq2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d12, d15
+ vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
+ vqrshrn.u16 d23, q15, #4 ; w_oq3
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d5, d12
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d13, d15
+ vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
+ vqrshrn.u16 d1, q15, #4 ; w_oq4
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d6, d13
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d14, d15
+ vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
+ vqrshrn.u16 d2, q15, #4 ; w_oq5
+
+ vsub.i16 q15, q14
+ vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
+ vadd.i16 q15, q4
+ vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
+ vqrshrn.u16 d3, q15, #4 ; w_oq6
+ vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
+ vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
+ vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
+
+ bx lr
+ ENDP ; |aom_wide_mbfilter_neon|
+
+ END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
new file mode 100644
index 000000000..c90d6bfde
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+#if HAVE_NEON_ASM
+void aom_lpf_horizontal_8_dual_neon(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+ aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // HAVE_NEON_ASM
diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c
new file mode 100644
index 000000000..a1eeaf4b7
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
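+// Reduce two uint16x8_t partial-sum accumulators to a single 32-bit total.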
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8x16_t vec_src_32,
+ const uint8x16_t vec_src_48, const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16, const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+}
+
+void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
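+  // For each row, accumulate the SAD of the 64-byte source row against all four references.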
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+ sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
+ &vec_sum_ref0_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
+ &vec_sum_ref1_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
+ &vec_sum_ref2_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
+ &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+ const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+ const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+ const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+ vec_sum_ref0_lo =
+ vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
+ vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref0));
+ vec_sum_ref1_lo =
+ vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
+ vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref1));
+ vec_sum_ref2_lo =
+ vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
+ vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref2));
+ vec_sum_ref3_lo =
+ vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
+ vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref3));
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
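Note for reviewers (not part of the imported patch): the x4d kernels above amortize a single pass over the source block across four candidate reference blocks, which is the shape the motion-search loop wants. A scalar sketch of what they compute, with hypothetical names:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar reference for aom_sadWxHx4d_neon (W = H = 16/32/64) -- sketch. */
    static void sad_x4d_c_sketch(const uint8_t *src, int src_stride,
                                 const uint8_t *const ref[4], int ref_stride,
                                 int w, int h, uint32_t res[4]) {
      for (int k = 0; k < 4; ++k) {
        uint32_t sad = 0;
        for (int i = 0; i < h; ++i)
          for (int j = 0; j < w; ++j)
            sad += (uint32_t)abs(src[i * src_stride + j] -
                                 ref[k][i * ref_stride + j]);
        res[k] = sad;
      }
    }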
diff --git a/third_party/aom/aom_dsp/arm/sad_media.asm b/third_party/aom/aom_dsp/arm/sad_media.asm
new file mode 100644
index 000000000..49ddb6764
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_media.asm
@@ -0,0 +1,98 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ EXPORT |aom_sad16x16_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 const unsigned char *src_ptr
+; r1 int src_stride
+; r2 const unsigned char *ref_ptr
+; r3 int ref_stride
+|aom_sad16x16_media| PROC
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ mov r4, #0 ; sad = 0;
+ mov r5, #8 ; loop count
+
+loop
+ ; 1st row
+ ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
+
+ usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
+
+ add r0, r0, r1 ; set src pointer to next row
+ add r2, r2, r3 ; set dst pointer to next row
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
+ add r4, r4, r8 ; add partial sad values
+
+ ; 2nd row
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
+
+ usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
+
+ add r0, r0, r1 ; set src pointer to next row
+ add r2, r2, r3 ; set dst pointer to next row
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ subs r5, r5, #1 ; decrement loop counter
+ add r4, r4, r8 ; add partial sad values
+
+ bne loop
+
+ mov r0, r4 ; return sad
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
+
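Note for reviewers (sketch, not from the imported file): the ARMv6 routine above gets its throughput from USAD8/USADA8, which sum the absolute differences of four packed bytes in a single instruction. Each USADA8 behaves roughly like:

    #include <stdint.h>

    /* Byte-wise absolute-difference accumulate over one pair of 32-bit
     * words, as USADA8 does (illustrative only). */
    static uint32_t usada8_sketch(uint32_t acc, uint32_t x, uint32_t y) {
      for (int i = 0; i < 4; ++i) {
        const uint8_t a = (uint8_t)(x >> (8 * i));
        const uint8_t b = (uint8_t)(y >> (8 * i));
        acc += (a > b) ? (uint32_t)(a - b) : (uint32_t)(b - a);
      }
      return acc;
    }

Each loop iteration applies this to eight word pairs (two 16-pixel rows), so the 16x16 SAD finishes in the eight iterations set up in r5.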
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
new file mode 100644
index 000000000..2f452f55b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+
+unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 15; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x2_t d1;
+ uint64x1_t d3;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 3; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ d1 = vpaddl_u16(vget_low_u16(q12));
+ d3 = vpaddl_u32(d1);
+
+ return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 7; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
+ const uint32x4_t a = vpaddlq_u16(vec_16x8);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+ }
+ return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
+}
+
+unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ }
+ return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+}
+
+unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref = vld1q_u8(ref);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum_lo =
+ vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
+ vec_accum_hi =
+ vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
+ }
+ return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+}
+
+unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum = vdupq_n_u16(0);
+
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t vec_src = vld1_u8(src);
+ const uint8x8_t vec_ref = vld1_u8(ref);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
+ }
+ return horizontal_add_16x8(vec_accum);
+}
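Note for reviewers: every NEON kernel above keeps eight 16-bit partial sums per accumulator and folds them only once, at the end. In scalar terms, horizontal_add_16x8 is simply:

    #include <stdint.h>

    /* Fold eight uint16 lanes into one 32-bit total (sketch of
     * horizontal_add_16x8). */
    static uint32_t horizontal_add_u16x8_sketch(const uint16_t lane[8]) {
      uint32_t total = 0;
      for (int i = 0; i < 8; ++i) total += lane[i];
      return total;
    }

The _long_ variant reduces each accumulator with 32-bit widening before adding the two results; that matters for 64x64, where a single lane can reach 64 * 4 * 255 = 65280 and a 16-bit add of the low and high accumulators (as the smaller kernels do with vaddq_u16) could wrap.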
diff --git a/third_party/aom/aom_dsp/arm/save_reg_neon.asm b/third_party/aom/aom_dsp/arm/save_reg_neon.asm
new file mode 100644
index 000000000..e04969823
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,39 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ EXPORT |aom_push_neon|
+ EXPORT |aom_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|aom_push_neon| PROC
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+|aom_pop_neon| PROC
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_media.c b/third_party/aom/aom_dsp/arm/subpel_variance_media.c
new file mode 100644
index 000000000..46ec028d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_media.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_MEDIA
+static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
+ { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 },
+ { 32, 96 }, { 16, 112 } };
+
+extern void aom_filter_block2d_bil_first_pass_media(
+ const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
+ uint32_t height, uint32_t width, const int16_t *filter);
+
+extern void aom_filter_block2d_bil_second_pass_media(
+ const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
+ uint32_t height, uint32_t width, const int16_t *filter);
+
+unsigned int aom_sub_pixel_variance8x8_media(
+ const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+ const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+ uint16_t first_pass[10 * 8];
+ uint8_t second_pass[8 * 8];
+ const int16_t *HFilter, *VFilter;
+
+ HFilter = bilinear_filters_media[xoffset];
+ VFilter = bilinear_filters_media[yoffset];
+
+ aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+ src_pixels_per_line, 9, 8, HFilter);
+ aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
+ VFilter);
+
+ return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
+ sse);
+}
+
+unsigned int aom_sub_pixel_variance16x16_media(
+ const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+ const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+ uint16_t first_pass[36 * 16];
+ uint8_t second_pass[20 * 16];
+ const int16_t *HFilter, *VFilter;
+ unsigned int var;
+
+ if (xoffset == 4 && yoffset == 0) {
+ var = aom_variance_halfpixvar16x16_h_media(
+ src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ } else if (xoffset == 0 && yoffset == 4) {
+ var = aom_variance_halfpixvar16x16_v_media(
+ src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ } else if (xoffset == 4 && yoffset == 4) {
+ var = aom_variance_halfpixvar16x16_hv_media(
+ src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ } else {
+ HFilter = bilinear_filters_media[xoffset];
+ VFilter = bilinear_filters_media[yoffset];
+
+ aom_filter_block2d_bil_first_pass_media(
+ src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
+ aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
+ 16, VFilter);
+
+ var = aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
+ sse);
+ }
+ return var;
+}
+#endif // HAVE_MEDIA
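Note for reviewers (sketch, not from the imported sources): the media sub-pixel variance path is a two-pass bilinear filter into an intermediate buffer, followed by the plain variance kernel. Assuming the usual (sum + 64) >> 7 rounding of the 7-bit taps in each pass -- the exact arithmetic lives in bilinear_filter_media.asm -- the two externs above behave like:

    #include <stdint.h>

    /* First pass: horizontal taps, with one extra output row so the
     * vertical pass always has a row below (sketch). */
    static void bil_first_pass_sketch(const uint8_t *src, uint16_t *dst,
                                      uint32_t src_pitch, uint32_t height,
                                      uint32_t width, const int16_t *filter) {
      for (uint32_t i = 0; i < height; ++i) {
        for (uint32_t j = 0; j < width; ++j)
          dst[j] = (uint16_t)((src[j] * filter[0] +
                               src[j + 1] * filter[1] + 64) >> 7);
        src += src_pitch;
        dst += width;
      }
    }

    /* Second pass: vertical taps over the intermediate rows (sketch). */
    static void bil_second_pass_sketch(const uint16_t *src, uint8_t *dst,
                                       int32_t src_pitch, uint32_t height,
                                       uint32_t width, const int16_t *filter) {
      for (uint32_t i = 0; i < height; ++i) {
        for (uint32_t j = 0; j < width; ++j)
          dst[j] = (uint8_t)((src[j] * filter[0] +
                              src[j + src_pitch] * filter[1] + 64) >> 7);
        src += src_pitch;
        dst += width;
      }
    }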
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 000000000..064b72d6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 16) {
+ const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+ const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
+ const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+ const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+ const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+ const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+ const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+ const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+ vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
+
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
+ bilinear_filters[yoffset]);
+ return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
+ bilinear_filters[yoffset]);
+ return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
+ bilinear_filters[yoffset]);
+ return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
+ bilinear_filters[yoffset]);
+ return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+}
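Note for reviewers: the NEON version has the same two-pass shape, but the rounding is explicit in the intrinsics. vrshrn_n_u16(x, FILTER_BITS) is a rounding narrowing shift, and with FILTER_BITS at its usual value of 7 (aom_filter.h) each filtered sample reduces to:

    #include <stdint.h>

    /* Per-lane effect of vmull_u8/vmlal_u8 + vrshrn_n_u16(..., 7) in the
     * filters above, assuming FILTER_BITS == 7 (sketch). f0 + f1 == 128. */
    static uint8_t bilinear_tap_sketch(uint8_t a, uint8_t b, uint8_t f0,
                                       uint8_t f1) {
      return (uint8_t)((a * f0 + b * f1 + 64) >> 7);
    }

The intermediate fdata3 buffers carry h + 1 rows because the vertical pass reads pixel_step = w elements ahead, i.e. the row below, for every output row.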
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
new file mode 100644
index 000000000..cb8a2daf8
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subtract_neon.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r, c;
+
+ if (cols > 16) {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; c += 32) {
+ const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t v_diff_lo_00 =
+ vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
+ const uint16x8_t v_diff_hi_00 =
+ vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 =
+ vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 =
+ vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 8) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x16_t v_src = vld1q_u8(&src[0]);
+ const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+ const uint16x8_t v_diff_lo =
+ vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi =
+ vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 4) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint8x8_t v_pred = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ }
+}
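Note for reviewers: one easy-to-miss detail above is that vsubl_u8 yields an unsigned 16-bit result which is then reinterpreted as signed. That is exact, because the mod-2^16 wraparound of a - b is bit-identical to the two's-complement encoding of the true residual, which always lies in [-255, 255]:

    #include <stdint.h>

    /* Why the vsubl_u8 + vreinterpretq_s16_u16 pattern is exact (sketch;
     * assumes a two's-complement target, as all supported ones are). */
    static int16_t residual_sketch(uint8_t a, uint8_t b) {
      const uint16_t wrapped = (uint16_t)((uint16_t)a - (uint16_t)b);
      return (int16_t)wrapped; /* == (int)a - (int)b for all byte inputs */
    }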
diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
new file mode 100644
index 000000000..1e5c9178e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -0,0 +1,185 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ EXPORT |aom_variance_halfpixvar16x16_h_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|aom_variance_halfpixvar16x16_h_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
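Note for reviewers: the mvn / uhsub8 / eor 0x80808080 sequence above (also used by the hv and v variants that follow) computes the rounded half-pel average (a + b + 1) >> 1 on four byte lanes at once, without unpacking. The identity it relies on can be checked exhaustively:

    #include <assert.h>

    /* Per lane: uhsub8(a, ~b) leaves ((a + b - 255) >> 1) & 0xFF, and the
     * final eor with 0x80 turns that into (a + b + 1) >> 1. The check
     * assumes arithmetic >> on negative ints, as on the targeted
     * compilers. */
    static void check_halfpel_identity(void) {
      for (int a = 0; a < 256; ++a) {
        for (int b = 0; b < 256; ++b) {
          const int lane = ((a + b - 255) >> 1) & 0xFF;
          assert((lane ^ 0x80) == ((a + b + 1) >> 1));
        }
      }
    }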
diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
new file mode 100644
index 000000000..9e0af830e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -0,0 +1,225 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ EXPORT |aom_variance_halfpixvar16x16_hv_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|aom_variance_halfpixvar16x16_hv_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
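Note for reviewers: when comparing this against a C reference, keep in mind that the hv kernel builds its half-pel sample from three nested rounded averages rather than a single (a + b + c + d + 2) >> 2, so individual samples can differ by one from a one-step 2D bilinear average:

    #include <stdint.h>

    /* What the hv kernel computes per pixel (three rounded averages)... */
    static uint8_t halfpel_hv_nested(uint8_t a, uint8_t b, uint8_t c,
                                     uint8_t d) {
      const uint8_t x = (uint8_t)((a + b + 1) >> 1); /* horizontal, row N   */
      const uint8_t y = (uint8_t)((c + d + 1) >> 1); /* horizontal, row N+1 */
      return (uint8_t)((x + y + 1) >> 1);            /* vertical            */
    }

    /* ...versus the single-rounding form; e.g. (0, 0, 0, 1) gives 1 above
     * but 0 here. */
    static uint8_t halfpel_hv_single(uint8_t a, uint8_t b, uint8_t c,
                                     uint8_t d) {
      return (uint8_t)((a + b + c + d + 2) >> 2);
    }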
diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
new file mode 100644
index 000000000..545b68179
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -0,0 +1,187 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ EXPORT |aom_variance_halfpixvar16x16_v_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|aom_variance_halfpixvar16x16_v_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; set src pointer to next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
diff --git a/third_party/aom/aom_dsp/arm/variance_media.asm b/third_party/aom/aom_dsp/arm/variance_media.asm
new file mode 100644
index 000000000..fdc311a81
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_media.asm
@@ -0,0 +1,361 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+ EXPORT |aom_variance16x16_media|
+ EXPORT |aom_variance8x8_media|
+ EXPORT |aom_mse16x16_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|aom_variance16x16_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+
+loop16x16
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop16x16
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|aom_variance8x8_media| PROC
+
+ push {r4-r10, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #8 ; set loop counter to 8 (=block height)
+ mov r4, #0 ; initialize sum = 0
+ mov r5, #0 ; initialize sse = 0
+
+loop8x8
+ ; 1st 4 pixels
+ ldr r6, [r0, #0x0] ; load 4 src pixels
+ ldr r7, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r6, r7 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r0, #0x4] ; load 4 src pixels
+ ldr r7, [r2, #0x4] ; load 4 ref pixels
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r6, r7 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1 ; next row
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ bne loop8x8
+
+ ; return stuff
+ ldr r8, [sp, #32] ; get address of sse
+ mul r1, r4, r4 ; sum * sum
+ str r5, [r8] ; store sse
+ sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;
+;note: Based on aom_variance16x16_media. In this function, sum is never used,
+;      so that part of the calculation has been removed.
+
+|aom_mse16x16_media| PROC
+
+ push {r4-r9, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov r4, #0 ; initialize sse = 0
+
+loopmse
+ ; 1st 4 pixels
+ ldr r5, [r0, #0x0] ; load 4 src pixels
+ ldr r6, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r5, r6 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0x4] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r2, #0x4] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+ ldr r5, [r0, #0x8] ; load 4 src pixels
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r6, [r2, #0x8] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0xc] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r6, [r2, #0xc] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ subs r12, r12, #1 ; next row
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ bne loopmse
+
+ ; return stuff
+ ldr r1, [sp, #28] ; get address of sse
+ mov r0, r4 ; return sse
+ str r4, [r1] ; store sse
+
+ pop {r4-r9, pc}
+
+ ENDP
+
+ END
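Note for reviewers (scalar reference, not from the imported sources): the two variance routines above accumulate sum and sse over the block and return the usual shortcut var = sse - sum^2 / N, implemented as the >> 8 (N = 256, 16x16) and >> 6 (N = 64, 8x8) at the end; the MSE routine drops the sum and returns sse directly. The 16x16 case in plain C:

    #include <stdint.h>

    /* var = sse - (sum * sum) / 256, with sse also stored through *sse
     * (sketch). */
    static uint32_t variance16x16_c_sketch(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           uint32_t *sse) {
      int64_t sum = 0;
      uint64_t sq = 0;
      for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
          const int d = src[j] - ref[j];
          sum += d;
          sq += (uint64_t)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (uint32_t)sq;
      return (uint32_t)(sq - (uint64_t)((sum * sum) >> 8));
    }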
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
new file mode 100644
index 000000000..dbab287e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+// w * h must be less than 2048 or local variable v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse,
+ int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo =
+ vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+ v_sse_hi =
+ vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
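
The NEON helper above accumulates two quantities per block: the raw sum of pixel differences (v_sum) and the sum of squared differences (v_sse_lo/v_sse_hi); each width-specific wrapper below then forms variance as sse - sum^2 / (w * h), which is why the shift amounts equal log2(w * h) (6 for 8x8, 8 for 16x16, and so on). A minimal scalar sketch of the same accumulation, using a hypothetical helper name that is not part of the imported sources:

static void variance_scalar_sketch(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride, int w,
                                   int h, uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];     // same difference vsubl_u8 forms
      *sum += diff;                     // accumulated in v_sum above
      *sse += (uint32_t)(diff * diff);  // accumulated in v_sse_lo/v_sse_hi
    }
    a += a_stride;
    b += b_stride;
  }
}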
+
+void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, unsigned int *sse, int *sum) {
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+}
+
+void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, unsigned int *sse, int *sum) {
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
+}
+
+unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+ return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8);
+}
+
+unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+ variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
+ 32, 32, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
+}
+
+unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
+}
+
+unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
+}
+
+unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride, unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) {
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride, unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d2u8, d4u8, d6u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint16x8_t q11u16, q12u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) {
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d2u8, d6u8);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ int64x1_t d0s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ q7s32 = vdupq_n_s32(0);
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) { // mse16x16_neon_loop
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+ q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+ q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q10s32 = vaddq_s32(q7s32, q9s32);
+
+ q1s64 = vpaddlq_s32(q10s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
+unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride) {
+ int16x4_t d22s16, d24s16, d26s16, d28s16;
+ int64x1_t d0s64;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d1u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d5u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d3u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d7u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d1u8, d5u8);
+ q13u16 = vsubl_u8(d2u8, d6u8);
+ q14u16 = vsubl_u8(d3u8, d7u8);
+
+ d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+ d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+ d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+ d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+ q7s32 = vmull_s16(d22s16, d22s16);
+ q8s32 = vmull_s16(d24s16, d24s16);
+ q9s32 = vmull_s16(d26s16, d26s16);
+ q10s32 = vmull_s16(d28s16, d28s16);
+
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q9s32 = vaddq_s32(q7s32, q9s32);
+
+ q1s64 = vpaddlq_s32(q9s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c
new file mode 100644
index 000000000..eb6059705
--- /dev/null
+++ b/third_party/aom/aom_dsp/avg.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
+unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 8; ++i, src += stride)
+ for (j = 0; j < 8; sum += src[j], ++j) {
+ }
+
+ return ROUND_POWER_OF_TWO(sum, 6);
+}
+
+unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 4; ++i, src += stride)
+ for (j = 0; j < 4; sum += src[j], ++j) {
+ }
+
+ return ROUND_POWER_OF_TWO(sum, 4);
+}
+
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+// second pass, 12 bit, dynamic range [-2040, 2040]
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// The order of the output coefficients of the Hadamard transform is not
+// important. For optimization purposes the final transpose may be skipped.
+void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
+ // dynamic range [-255, 255]
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ coeff += 8; // coeff: 15 bit
+ // dynamic range [-16320, 16320]
+ ++tmp_buf;
+ }
+}
+
+// In place 16x16 2D Hadamard transform
+void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ // coeff: 15 bit, dynamic range [-16320, 16320]
+ for (idx = 0; idx < 64; ++idx) {
+ int16_t a0 = coeff[0];
+ int16_t a1 = coeff[64];
+ int16_t a2 = coeff[128];
+ int16_t a3 = coeff[192];
+
+ int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
+ int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
+ int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
+ int16_t b3 = (a2 - a3) >> 1;
+
+ coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[64] = b1 + b3;
+ coeff[128] = b0 - b2;
+ coeff[192] = b1 - b3;
+
+ ++coeff;
+ }
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int aom_satd_c(const int16_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+ // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+ return satd;
+}
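
The two routines above are typically paired: aom_hadamard_8x8_c concentrates the residual energy and aom_satd_c sums the absolute coefficients, and because only absolute values are summed the skipped transpose noted earlier does not matter. A brief usage sketch (the helper name and the residual buffer are illustrative assumptions, not library API):

static int satd_8x8_sketch(const int16_t *residual, int stride) {
  int16_t coeff[64];
  aom_hadamard_8x8_c(residual, stride, coeff);  // 2D Hadamard, 15-bit output
  return aom_satd_c(coeff, 64);                 // sum of absolute coefficients
}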
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64}.
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int ref_stride,
+ int height) {
+ int idx;
+ const int norm_factor = height >> 1;
+ for (idx = 0; idx < 16; ++idx) {
+ int i;
+ hbuf[idx] = 0;
+ // hbuf[idx]: 14 bit, dynamic range [0, 16320].
+ for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+ // hbuf[idx]: 9 bit, dynamic range [0, 510].
+ hbuf[idx] /= norm_factor;
+ ++ref;
+ }
+}
+
+// width: value range {16, 32, 64}.
+int16_t aom_int_pro_col_c(const uint8_t *ref, int width) {
+ int idx;
+ int16_t sum = 0;
+ // sum: 14 bit, dynamic range [0, 16320]
+ for (idx = 0; idx < width; ++idx) sum += ref[idx];
+ return sum;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4}
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
+ int i;
+ int width = 4 << bwl;
+ int sse = 0, mean = 0, var;
+
+ for (i = 0; i < width; ++i) {
+ int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits.
+ mean += diff; // mean: dynamic range 16 bits.
+ sse += diff * diff; // sse: dynamic range 26 bits.
+ }
+
+ // (mean * mean): dynamic range 31 bits.
+ var = sse - ((mean * mean) >> (bwl + 2));
+ return var;
+}
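
For example, with bwl = 2 the vector width is 4 << 2 = 16 and the subtracted term is (mean * mean) >> 4, i.e. mean^2 / 16, so the function returns the unnormalized sum of squared deviations (note that the variable named mean actually holds the raw sum of the differences).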
+
+void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int *min, int *max) {
+ int i, j;
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(src[j] - ref[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) {
+ int i, j;
+ int sum = 0;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+ for (i = 0; i < 8; ++i, s += stride)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
+
+ return ROUND_POWER_OF_TWO(sum, 6);
+}
+
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) {
+ int i, j;
+ int sum = 0;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+ for (i = 0; i < 4; ++i, s += stride)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
+
+ return ROUND_POWER_OF_TWO(sum, 4);
+}
+
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ int i, j;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j] - d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
new file mode 100644
index 000000000..96c4cb436
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitreader.h"
+
+#include "av1/common/common.h"
+
+// Inverse recenters a non-negative literal v around a reference r
+static uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
+ if (v > (r << 1))
+ return v;
+ else if ((v & 1) == 0)
+ return (v >> 1) + r;
+ else
+ return r - ((v + 1) >> 1);
+}
+
+// Inverse recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
+ if ((r << 1) <= n) {
+ return inv_recenter_nonneg(r, v);
+ } else {
+ return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
+ }
+}
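
As a hedged illustration of the mapping (the check below is a standalone sketch, not part of the imported sources): with reference r = 3, the smallest recentered codes decode to the values nearest the reference, and any code larger than 2 * r decodes to itself.

static void inv_recenter_sketch(void) {
  const uint16_t expected[6] = { 3, 2, 4, 1, 5, 0 };  // codes 0..5, r = 3
  uint16_t code;
  for (code = 0; code < 6; ++code)
    assert(inv_recenter_nonneg(3, code) == expected[code]);
  assert(inv_recenter_nonneg(3, 9) == 9);  // codes above 2 * r pass through
}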
+
+int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits) {
+ if (aom_read_bit(r, NULL)) {
+ int s = aom_read_bit(r, NULL);
+ int16_t x = aom_read_literal(r, mag_bits, NULL) + 1;
+ return (s > 0 ? -x : x);
+ } else {
+ return 0;
+ }
+}
+
+uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n - 1) + 1;
+ const int m = (1 << l) - n;
+ const int v = aom_read_literal(r, l - 1, NULL);
+ return v < m ? v : (v << 1) - m + aom_read_bit(r, NULL);
+}
+
+uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p,
+ uint16_t ref) {
+ if (n <= 1) return 0;
+ assert(p > 0 && p <= n);
+ assert(ref < n);
+ int lolimit = ref - p / 2;
+ const int hilimit = lolimit + p - 1;
+ if (lolimit < 0) {
+ lolimit = 0;
+ } else if (hilimit >= n) {
+ lolimit = n - p;
+ }
+ int v;
+ if (aom_read_bit(r, NULL)) {
+ v = aom_read_primitive_quniform(r, p) + lolimit;
+ } else {
+ v = aom_read_primitive_quniform(r, n - p);
+ if (v >= lolimit) v += p;
+ }
+ return v;
+}
+
+// Decodes a finite subexponential code for a symbol v in [0, n-1] with
+// parameter k
+uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k) {
+ int i = 0;
+ int mk = 0;
+ uint16_t v;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ v = aom_read_primitive_quniform(r, n - mk) + mk;
+ break;
+ } else {
+ if (aom_read_bit(r, NULL)) {
+ i = i + 1;
+ mk += a;
+ } else {
+ v = aom_read_literal(r, b, NULL) + mk;
+ break;
+ }
+ }
+ }
+ return v;
+}
+
+// Decodes a finite subexponential code for a symbol v in [0, n-1] with
+// parameter k, based on a reference ref also in [0, n-1].
+uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k,
+ uint16_t ref) {
+ return inv_recenter_finite_nonneg(n, ref,
+ aom_read_primitive_subexpfin(r, n, k));
+}
+
+// Decodes a finite subexponential code for a symbol v in [-(n-1), n-1] with
+// parameter k, based on a reference ref also in [-(n-1), n-1].
+int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n,
+ uint16_t k, int16_t ref) {
+ ref += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref) - n + 1;
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
new file mode 100644
index 000000000..738d91da8
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_DSP_BINARY_CODES_READER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+
+int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits);
+
+uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n);
+uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p,
+ uint16_t ref);
+uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k);
+uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k,
+ uint16_t ref);
+int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n,
+ uint16_t k, int16_t ref);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
new file mode 100644
index 000000000..91e807b29
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitwriter.h"
+
+#include "av1/common/common.h"
+
+// Recenters a non-negative literal v around a reference r
+static uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+ if (v > (r << 1))
+ return v;
+ else if (v >= r)
+ return ((v - r) << 1);
+ else
+ return ((r - v) << 1) - 1;
+}
+
+// Recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
+ if ((r << 1) <= n) {
+ return recenter_nonneg(r, v);
+ } else {
+ return recenter_nonneg(n - 1 - r, n - 1 - v);
+ }
+}
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
+// mag_bits is the number of bits for the magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate the magnitude,
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+ unsigned int abs_bits) {
+ if (v == 0) {
+ aom_write_bit(w, 0);
+ } else {
+ const int x = abs(v);
+ const int s = v < 0;
+ aom_write_bit(w, 1);
+ aom_write_bit(w, s);
+ aom_write_literal(w, x - 1, abs_bits);
+ }
+}
+
+int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
+ return (v == 0 ? 1 : abs_bits + 2);
+}
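
A hedged cost illustration (standalone sketch, not imported code): with mag_bits = 3 the alphabet is {-8, ..., 8}; zero costs a single bit, while any non-zero value costs the non-zero flag plus the sign plus three magnitude bits, i.e. five bits in total.

static void symmetric_cost_sketch(void) {
  assert(aom_count_primitive_symmetric(0, 3) == 1);
  assert(aom_count_primitive_symmetric(-5, 3) == 5);
  assert(aom_count_primitive_symmetric(8, 3) == 5);
}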
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
+ if (n <= 1) return;
+ const int l = get_msb(n - 1) + 1;
+ const int m = (1 << l) - n;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_bit(w, (v - m) & 1);
+ }
+}
+
+int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n - 1) + 1;
+ const int m = (1 << l) - n;
+ return v < m ? l - 1 : l;
+}
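
As a concrete illustration of the quasi-uniform split (standalone sketch, not imported code): for n = 5 we get l = 3 and m = 3, so the first three values cost two bits and the remaining two cost three.

static void quniform_cost_sketch(void) {
  assert(aom_count_primitive_quniform(5, 0) == 2);
  assert(aom_count_primitive_quniform(5, 2) == 2);
  assert(aom_count_primitive_quniform(5, 3) == 3);
  assert(aom_count_primitive_quniform(5, 4) == 3);
}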
+
+// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
+// The p values closest to ref are coded using a p-ary quasi-uniform
+// short code, while the remaining n-p values are coded with a longer code.
+void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
+ uint16_t ref, uint16_t v) {
+ if (n <= 1) return;
+ assert(p > 0 && p <= n);
+ assert(ref < n);
+ int lolimit = ref - p / 2;
+ int hilimit = lolimit + p - 1;
+ if (lolimit < 0) {
+ lolimit = 0;
+ hilimit = p - 1;
+ } else if (hilimit >= n) {
+ hilimit = n - 1;
+ lolimit = n - p;
+ }
+ if (v >= lolimit && v <= hilimit) {
+ aom_write_bit(w, 1);
+ v = v - lolimit;
+ aom_write_primitive_quniform(w, p, v);
+ } else {
+ aom_write_bit(w, 0);
+ if (v > hilimit) v -= p;
+ aom_write_primitive_quniform(w, n - p, v);
+ }
+}
+
+int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
+ uint16_t v) {
+ if (n <= 1) return 0;
+ assert(p > 0 && p <= n);
+ assert(ref < n);
+ int lolimit = ref - p / 2;
+ int hilimit = lolimit + p - 1;
+ if (lolimit < 0) {
+ lolimit = 0;
+ hilimit = p - 1;
+ } else if (hilimit >= n) {
+ hilimit = n - 1;
+ lolimit = n - p;
+ }
+ int count = 0;
+ if (v >= lolimit && v <= hilimit) {
+ count++;
+ v = v - lolimit;
+ count += aom_count_primitive_quniform(p, v);
+ } else {
+ count++;
+ if (v > hilimit) v -= p;
+ count += aom_count_primitive_quniform(n - p, v);
+ }
+ return count;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t v) {
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ aom_write_primitive_quniform(w, n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ aom_write_bit(w, t);
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ aom_write_literal(w, v - mk, b);
+ break;
+ }
+ }
+ }
+}
+
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
+ int count = 0;
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ count += aom_count_primitive_quniform(n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ count++;
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ count += b;
+ break;
+ }
+ }
+ }
+ return count;
+}
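
A hedged illustration of the bucket structure (standalone sketch, not imported code): for n = 16 and k = 1 a unary prefix selects buckets of geometrically growing size, so small symbols are cheapest: {0, 1} costs two bits, {2, 3} costs three, and the quasi-uniform tail {4..15} costs five or six.

static void subexpfin_cost_sketch(void) {
  assert(aom_count_primitive_subexpfin(16, 1, 0) == 2);
  assert(aom_count_primitive_subexpfin(16, 1, 2) == 3);
  assert(aom_count_primitive_subexpfin(16, 1, 4) == 5);
  assert(aom_count_primitive_subexpfin(16, 1, 15) == 6);
}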
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+// Recenters the symbol around ref first and then uses a finite
+// subexponential code.
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                      uint16_t ref, uint16_t v) {
+ aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+                                             uint16_t k, int16_t ref,
+                                             int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
+}
+
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+ uint16_t v) {
+ return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+ int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
new file mode 100644
index 000000000..ab5ccbf15
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_DSP_BINARY_CODES_WRITER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitwriter.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
+// mag_bits is the number of bits for the magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate the magnitude,
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+ unsigned int mag_bits);
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
+
+// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
+// The p values closest to ref are coded using a p-ary quasi-uniform
+// short code, while the remaining n-p values are coded with a longer code.
+void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
+ uint16_t ref, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with
+// parameter k based on a reference ref also in [-(n-1), n-1].
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+ uint16_t k, int16_t ref,
+ int16_t v);
+
+// Functions that count bits for the above primitives
+int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
+int aom_count_primitive_quniform(uint16_t n, uint16_t v);
+int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
+ uint16_t v);
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+ uint16_t v);
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+ int16_t v);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_BINARY_CODES_WRITER_H_
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
new file mode 100644
index 000000000..9cd34dd48
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITREADER_H_
+#define AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
+#endif
+
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#if CONFIG_ANS
+#include "aom_dsp/ansreader.h"
+#elif CONFIG_DAALA_EC
+#include "aom_dsp/daalaboolreader.h"
+#else
+#include "aom_dsp/dkboolreader.h"
+#endif
+#include "aom_dsp/prob.h"
+#include "av1/common/odintrin.h"
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+#define aom_read(r, prob, ACCT_STR_NAME) \
+ aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_bit(r, ACCT_STR_NAME) \
+ aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+ aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+ aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_ANS
+typedef struct AnsDecoder aom_reader;
+#elif CONFIG_DAALA_EC
+typedef struct daala_reader aom_reader;
+#else
+typedef struct aom_dk_reader aom_reader;
+#endif
+
+static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
+ size_t size, aom_decrypt_cb decrypt_cb,
+ void *decrypt_state) {
+#if CONFIG_ANS
+ (void)decrypt_cb;
+ (void)decrypt_state;
+ if (size > INT_MAX) return 1;
+ return ans_read_init(r, buffer, (int)size);
+#elif CONFIG_DAALA_EC
+ (void)decrypt_cb;
+ (void)decrypt_state;
+ return aom_daala_reader_init(r, buffer, (int)size);
+#else
+ return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state);
+#endif
+}
+
+static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
+#if CONFIG_ANS
+ (void)r;
+ assert(0 && "Use the raw buffer size with ANS");
+ return NULL;
+#elif CONFIG_DAALA_EC
+ return aom_daala_reader_find_end(r);
+#else
+ return aom_dk_reader_find_end(r);
+#endif
+}
+
+static INLINE int aom_reader_has_error(aom_reader *r) {
+#if CONFIG_ANS
+ return ans_reader_has_error(r);
+#elif CONFIG_DAALA_EC
+ return aom_daala_reader_has_error(r);
+#else
+ return aom_dk_reader_has_error(r);
+#endif
+}
+
+// Returns the position in the bit reader in bits.
+static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
+#if CONFIG_ANS
+ (void)r;
+ assert(0 && "aom_reader_tell() is unimplemented for ANS");
+ return 0;
+#elif CONFIG_DAALA_EC
+ return aom_daala_reader_tell(r);
+#else
+ return aom_dk_reader_tell(r);
+#endif
+}
+
+// Returns the position in the bit reader in 1/8th bits.
+static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
+#if CONFIG_ANS
+ (void)r;
+ assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
+ return 0;
+#elif CONFIG_DAALA_EC
+ return aom_daala_reader_tell_frac(r);
+#else
+ return aom_dk_reader_tell_frac(r);
+#endif
+}
+
+#if CONFIG_ACCOUNTING
+static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
+ if (r->accounting != NULL) {
+ uint32_t tell_frac;
+ tell_frac = aom_reader_tell_frac(r);
+ aom_accounting_record(r->accounting, ACCT_STR_NAME,
+ tell_frac - r->accounting->last_tell_frac);
+ r->accounting->last_tell_frac = tell_frac;
+ }
+}
+
+static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
+ if (r->accounting != NULL) {
+ r->accounting->syms.num_multi_syms += !is_binary;
+ r->accounting->syms.num_binary_syms += !!is_binary;
+ }
+}
+#endif
+
+static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
+ int ret;
+#if CONFIG_ANS
+ ret = rabs_read(r, prob);
+#elif CONFIG_DAALA_EC
+ ret = aom_daala_read(r, prob);
+#else
+ ret = aom_dk_read(r, prob);
+#endif
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, 1);
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
+ int ret;
+#if CONFIG_ANS
+  ret = rabs_read_bit(r);  // Non-trivial optimization at half probability
+#elif CONFIG_DAALA_EC && CONFIG_RAWBITS
+ // Note this uses raw bits and is not the same as aom_daala_read(r, 128);
+ // Calls to this function are omitted from raw symbol accounting.
+ ret = aom_daala_read_bit(r);
+#else
+ ret = aom_read(r, 128, NULL); // aom_prob_half
+#endif
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
+ int literal = 0, bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+ return literal;
+}
+
+static INLINE int aom_read_tree_as_bits(aom_reader *r,
+ const aom_tree_index *tree,
+ const aom_prob *probs) {
+ aom_tree_index i = 0;
+
+ while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue;
+ return -i;
+}
+
+#if CONFIG_EC_MULTISYMBOL
+static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
+ int nsymbs ACCT_STR_PARAM) {
+ int ret;
+#if CONFIG_ANS
+ (void)nsymbs;
+ ret = rans_read(r, cdf);
+#elif CONFIG_DAALA_EC
+ ret = daala_read_symbol(r, cdf, nsymbs);
+#else
+#error \
+ "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+ "coder. Enable daala_ec or ans for a valid configuration."
+#endif
+
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, (nsymbs == 2));
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
+ int nsymbs ACCT_STR_PARAM) {
+ int ret;
+ ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
+#if CONFIG_EC_ADAPT
+ update_cdf(cdf, ret, nsymbs);
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_tree_as_cdf(aom_reader *r,
+ const aom_tree_index *tree,
+ const aom_prob *probs) {
+ aom_tree_index i = 0;
+ do {
+ aom_cdf_prob cdf[16];
+ aom_tree_index index[16];
+ int path[16];
+ int dist[16];
+ int nsymbs;
+ int symb;
+ nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist);
+ symb = aom_read_cdf(r, cdf, nsymbs, NULL);
+ OD_ASSERT(symb >= 0 && symb < nsymbs);
+ i = index[symb];
+ } while (i > 0);
+ return -i;
+}
+#endif // CONFIG_EC_MULTISYMBOL
+
+static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
+ const aom_prob *probs ACCT_STR_PARAM) {
+ int ret;
+#if CONFIG_EC_MULTISYMBOL
+ ret = aom_read_tree_as_cdf(r, tree, probs);
+#else
+ ret = aom_read_tree_as_bits(r, tree, probs);
+#endif
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+ return ret;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_BITREADER_H_
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
new file mode 100644
index 000000000..009682b4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "./aom_config.h"
+#include "./bitreader_buffer.h"
+
+size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
+ return (rb->bit_offset + 7) >> 3;
+}
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
+ const uint32_t off = rb->bit_offset;
+ const uint32_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
+ rb->bit_offset = off + 1;
+ return bit;
+ } else {
+ rb->error_handler(rb->error_handler_data);
+ return 0;
+ }
+}
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+ int value = 0, bit;
+ for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
+ return value;
+}
+
+int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+ const int value = aom_rb_read_literal(rb, bits);
+ return aom_rb_read_bit(rb) ? -value : value;
+}
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+ const int nbits = sizeof(unsigned) * 8 - bits - 1;
+ const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+ return ((int)value) >> nbits;
+}
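
The inverse-signed literal is stored as a two's-complement value in bits + 1 raw bits; shifting it to the top of an unsigned word and arithmetic-shifting it back down sign-extends it. For example, with bits = 4 the reader consumes five raw bits, and a raw value of 0b10011 (19) decodes to 19 - 32 = -13.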
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
new file mode 100644
index 000000000..22187357e
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*aom_rb_error_handler)(void *data);
+
+struct aom_read_bit_buffer {
+ const uint8_t *bit_buffer;
+ const uint8_t *bit_buffer_end;
+ uint32_t bit_offset;
+
+ void *error_handler_data;
+ aom_rb_error_handler error_handler;
+};
+
+size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
new file mode 100644
index 000000000..6e3fac260
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITWRITER_H_
+#define AOM_DSP_BITWRITER_H_
+
+#include <assert.h>
+#include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
+#endif
+
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#elif CONFIG_DAALA_EC
+#include "aom_dsp/daalaboolwriter.h"
+#else
+#include "aom_dsp/dkboolwriter.h"
+#endif
+#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/common/blockd.h"
+#include "av1/encoder/cost.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_ANS
+typedef struct BufAnsCoder aom_writer;
+#elif CONFIG_DAALA_EC
+typedef struct daala_writer aom_writer;
+#else
+typedef struct aom_dk_writer aom_writer;
+#endif
+
+typedef struct TOKEN_STATS {
+ int cost;
+#if CONFIG_VAR_TX
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
+#endif
+#endif
+} TOKEN_STATS;
+
+static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
+#if CONFIG_VAR_TX
+#if CONFIG_RD_DEBUG
+ int r, c;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ token_stats->txb_coeff_cost_map[r][c] = 0;
+ }
+ }
+#endif
+#endif
+ token_stats->cost = 0;
+}
+
+static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
+#if CONFIG_ANS
+ (void)bc;
+ (void)buffer;
+ assert(0 && "buf_ans requires a more complicated startup procedure");
+#elif CONFIG_DAALA_EC
+ aom_daala_start_encode(bc, buffer);
+#else
+ aom_dk_start_encode(bc, buffer);
+#endif
+}
+
+static INLINE void aom_stop_encode(aom_writer *bc) {
+#if CONFIG_ANS
+ (void)bc;
+ assert(0 && "buf_ans requires a more complicated shutdown procedure");
+#elif CONFIG_DAALA_EC
+ aom_daala_stop_encode(bc);
+#else
+ aom_dk_stop_encode(bc);
+#endif
+}
+
+static INLINE void aom_write(aom_writer *br, int bit, int probability) {
+#if CONFIG_ANS
+ buf_rabs_write(br, bit, probability);
+#elif CONFIG_DAALA_EC
+ aom_daala_write(br, bit, probability);
+#else
+ aom_dk_write(br, bit, probability);
+#endif
+}
+
+static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
+ TOKEN_STATS *token_stats) {
+ aom_write(br, bit, probability);
+#if CONFIG_RD_DEBUG
+ token_stats->cost += av1_cost_bit(probability, bit);
+#else
+ (void)token_stats;
+#endif
+}
+
+static INLINE void aom_write_bit(aom_writer *w, int bit) {
+#if CONFIG_ANS
+ buf_rabs_write_bit(w, bit);
+#elif CONFIG_DAALA_EC && CONFIG_RAWBITS
+ // Note this uses raw bits and is not the same as aom_daala_write(r, 128);
+ aom_daala_write_bit(w, bit);
+#else
+ aom_write(w, bit, 128); // aom_prob_half
+#endif
+}
+
+static INLINE void aom_write_bit_record(aom_writer *w, int bit,
+ TOKEN_STATS *token_stats) {
+ aom_write_bit(w, bit);
+#if CONFIG_RD_DEBUG
+ token_stats->cost += av1_cost_bit(128, bit); // aom_prob_half
+#else
+ (void)token_stats;
+#endif
+}
+
+static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
+}
+
+static INLINE void aom_write_tree_as_bits(aom_writer *w,
+ const aom_tree_index *tr,
+ const aom_prob *probs, int bits,
+ int len, aom_tree_index i) {
+ do {
+ const int bit = (bits >> --len) & 1;
+ aom_write(w, bit, probs[i >> 1]);
+ i = tr[i + bit];
+ } while (len);
+}
+
+static INLINE void aom_write_tree_as_bits_record(
+ aom_writer *w, const aom_tree_index *tr, const aom_prob *probs, int bits,
+ int len, aom_tree_index i, TOKEN_STATS *token_stats) {
+ do {
+ const int bit = (bits >> --len) & 1;
+ aom_write_record(w, bit, probs[i >> 1], token_stats);
+ i = tr[i + bit];
+ } while (len);
+}
+
+#if CONFIG_EC_MULTISYMBOL
+static INLINE void aom_write_cdf(aom_writer *w, int symb,
+ const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_ANS
+ (void)nsymbs;
+ assert(cdf);
+ const aom_cdf_prob cum_prob = symb > 0 ? cdf[symb - 1] : 0;
+ const aom_cdf_prob prob = cdf[symb] - cum_prob;
+ buf_rans_write(w, cum_prob, prob);
+#elif CONFIG_DAALA_EC
+ daala_write_symbol(w, symb, cdf, nsymbs);
+#else
+#error \
+ "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+ "coder. Enable daala_ec or ans for a valid configuration."
+#endif
+}
+
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+ int nsymbs) {
+ aom_write_cdf(w, symb, cdf, nsymbs);
+#if CONFIG_EC_ADAPT
+ update_cdf(cdf, symb, nsymbs);
+#endif
+}
+
+static INLINE void aom_write_tree_as_cdf(aom_writer *w,
+ const aom_tree_index *tree,
+ const aom_prob *probs, int bits,
+ int len, aom_tree_index i) {
+ aom_tree_index root;
+ root = i;
+ do {
+ aom_cdf_prob cdf[16];
+ aom_tree_index index[16];
+ int path[16];
+ int dist[16];
+ int nsymbs;
+ int symb;
+ int j;
+ /* Compute the CDF of the binary tree using the given probabilities. */
+ nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist);
+ /* Find the symbol to code. */
+ symb = -1;
+ for (j = 0; j < nsymbs; j++) {
+ /* If this symbol codes a leaf node, */
+ if (index[j] <= 0) {
+ if (len == dist[j] && path[j] == bits) {
+ symb = j;
+ break;
+ }
+ } else {
+ if (len > dist[j] && path[j] == bits >> (len - dist[j])) {
+ symb = j;
+ break;
+ }
+ }
+ }
+ OD_ASSERT(symb != -1);
+ aom_write_cdf(w, symb, cdf, nsymbs);
+ bits &= (1 << (len - dist[symb])) - 1;
+ len -= dist[symb];
+ } while (len);
+}
+
+#endif // CONFIG_EC_MULTISYMBOL
+
+static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
+ const aom_prob *probs, int bits, int len,
+ aom_tree_index i) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_tree_as_cdf(w, tree, probs, bits, len, i);
+#else
+ aom_write_tree_as_bits(w, tree, probs, bits, len, i);
+#endif
+}
+
+static INLINE void aom_write_tree_record(aom_writer *w,
+ const aom_tree_index *tree,
+ const aom_prob *probs, int bits,
+ int len, aom_tree_index i,
+ TOKEN_STATS *token_stats) {
+#if CONFIG_EC_MULTISYMBOL
+ (void)token_stats;
+ aom_write_tree_as_cdf(w, tree, probs, bits, len, i);
+#else
+ aom_write_tree_as_bits_record(w, tree, probs, bits, len, i, token_stats);
+#endif
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_BITWRITER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
new file mode 100644
index 000000000..1b3dd2913
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./bitwriter_buffer.h"
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
+ return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
+ const int off = (int)wb->bit_offset;
+ const int p = off / CHAR_BIT;
+ const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+ if (q == CHAR_BIT - 1) {
+ // Zero next char and write bit
+ wb->bit_buffer[p] = bit << q;
+ } else {
+ wb->bit_buffer[p] &= ~(1 << q);
+ wb->bit_buffer[p] |= bit << q;
+ }
+ wb->bit_offset = off + 1;
+}
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
+  // Do not zero bytes but overwrite existing values
+ const int off = (int)wb->bit_offset;
+ const int p = off / CHAR_BIT;
+ const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+ wb->bit_buffer[p] &= ~(1 << q);
+ wb->bit_buffer[p] |= bit << q;
+ wb->bit_offset = off + 1;
+}
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits) {
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--)
+ aom_wb_overwrite_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits) {
+ aom_wb_write_literal(wb, data, bits + 1);
+}
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
new file mode 100644
index 000000000..1f23dc857
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITWRITER_BUFFER_H_
+#define AOM_DSP_BITWRITER_BUFFER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_write_bit_buffer {
+ uint8_t *bit_buffer;
+ uint32_t bit_offset;
+};
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_BITWRITER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
new file mode 100644
index 000000000..e5297ff83
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BLEND_H_
+#define AOM_DSP_BLEND_H_
+
+#include "aom_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the aom_blend_* functions in aom_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+
+#define AOM_BLEND_A64_ROUND_BITS 6
+#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
+
+#define AOM_BLEND_A64(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A64_ROUND_BITS)
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define AOM_BLEND_A256_ROUND_BITS 8
+#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
+
+#define AOM_BLEND_A256(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
+
+#endif // AOM_DSP_BLEND_H_
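As a quick sanity check of the A64 macro above, a small sketch compiled inside the tree, assuming the usual round-half-up definition of ROUND_POWER_OF_TWO pulled in via aom_ports/mem.h; the pixel values are arbitrary:

#include <assert.h>
#include <stdio.h>
#include "aom_dsp/blend.h"

int main(void) {
  /* alpha = 16/64 of the first pixel, 48/64 of the second:
     (16 * 200 + 48 * 100 + 32) >> 6 = 8032 >> 6 = 125 */
  assert(AOM_BLEND_A64(16, 200, 100) == 125);
  assert(AOM_BLEND_A64(AOM_BLEND_A64_MAX_ALPHA, 200, 100) == 200);  // all src0
  assert(AOM_BLEND_A64(0, 200, 100) == 100);                        // all src1
  assert(AOM_BLEND_AVG(3, 4) == 4);  // (3 + 4 + 1) >> 1, rounds .5 up
  printf("blend macro checks passed\n");
  return 0;
}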
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
new file mode 100644
index 000000000..99b4b8a59
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_hmask.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "./aom_dsp_rtcd.h"
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(
+ mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(
+ mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
new file mode 100644
index 000000000..3e15542c9
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_mask.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "./aom_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ int w, int subh, int subw) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int subh, int subw, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
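For reference, a minimal call into the 8-bit C function above with no mask subsampling (subw == subh == 0), linked against this file; the 4x4 block, the strides and the half-and-half mask are arbitrary, and the prototype is repeated only so the sketch stands alone:

#include <stdint.h>
#include <stdio.h>

void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
                          const uint8_t *src0, uint32_t src0_stride,
                          const uint8_t *src1, uint32_t src1_stride,
                          const uint8_t *mask, uint32_t mask_stride, int h,
                          int w, int subh, int subw);

int main(void) {
  uint8_t src0[16], src1[16], dst[16];
  /* Left half fully src0 (alpha 64), right half fully src1 (alpha 0). */
  const uint8_t mask[16] = { 64, 64, 0, 0, 64, 64, 0, 0,
                             64, 64, 0, 0, 64, 64, 0, 0 };
  int i;
  for (i = 0; i < 16; ++i) {
    src0[i] = 200;
    src1[i] = 50;
  }
  aom_blend_a64_mask_c(dst, 4, src0, 4, src1, 4, mask, 4,
                       /*h=*/4, /*w=*/4, /*subh=*/0, /*subw=*/0);
  /* Expect "200 200 50 50" on every row. */
  for (i = 0; i < 16; ++i) printf("%d%c", dst[i], (i % 4 == 3) ? '\n' : ' ');
  return 0;
}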
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
new file mode 100644
index 000000000..1a5e30e31
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_vmask.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "./aom_dsp_rtcd.h"
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c
new file mode 100644
index 000000000..8fe1ff763
--- /dev/null
+++ b/third_party/aom/aom_dsp/buf_ans.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+
+#include "aom_dsp/buf_ans.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/internal/aom_codec_internal.h"
+
+void aom_buf_ans_alloc(struct BufAnsCoder *c,
+ struct aom_internal_error_info *error, int size) {
+ c->error = error;
+ c->size = size;
+ assert(c->size > 1);
+ AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
+ // Initialize to overfull to trigger the assert in write.
+ c->offset = c->size + 1;
+}
+
+void aom_buf_ans_free(struct BufAnsCoder *c) {
+ aom_free(c->buf);
+ c->buf = NULL;
+ c->size = 0;
+}
+
+#if !ANS_MAX_SYMBOLS
+void aom_buf_ans_grow(struct BufAnsCoder *c) {
+ struct buffered_ans_symbol *new_buf = NULL;
+ int new_size = c->size * 2;
+ AOM_CHECK_MEM_ERROR(c->error, new_buf,
+ aom_malloc(new_size * sizeof(*new_buf)));
+ memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
+ aom_free(c->buf);
+ c->buf = new_buf;
+ c->size = new_size;
+}
+#endif
+
+void aom_buf_ans_flush(struct BufAnsCoder *const c) {
+ int offset;
+#if ANS_MAX_SYMBOLS
+ if (c->offset == 0) return;
+#endif
+ assert(c->offset > 0);
+ offset = c->offset - 1;
+ // Code the first symbol such that it brings the state to the smallest normal
+ // state from an initial state that would have been a subnormal/refill state.
+ if (c->buf[offset].method == ANS_METHOD_RANS) {
+ c->ans.state += c->buf[offset].val_start;
+ } else {
+ c->ans.state += c->buf[offset].val_start ? c->buf[offset].prob : 0;
+ }
+ for (offset = offset - 1; offset >= 0; --offset) {
+ if (c->buf[offset].method == ANS_METHOD_RANS) {
+ rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob);
+ } else {
+ rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start,
+ (AnsP8)c->buf[offset].prob);
+ }
+ }
+ c->offset = 0;
+ c->output_bytes += ans_write_end(&c->ans);
+}
diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h
new file mode 100644
index 000000000..0768506b3
--- /dev/null
+++ b/third_party/aom/aom_dsp/buf_ans.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BUF_ANS_H_
+#define AOM_DSP_BUF_ANS_H_
+// Buffered forward ANS writer.
+// Symbols are written to the writer in forward (decode) order and serialized
+// backwards due to ANS's stack-like behavior.

+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/ans.h"
+#include "aom_dsp/answriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define ANS_METHOD_RABS 0
+#define ANS_METHOD_RANS 1
+
+struct buffered_ans_symbol {
+ unsigned int method : 1; // one of ANS_METHOD_RABS or ANS_METHOD_RANS
+ // TODO(aconverse): Should be possible to write this in terms of start for ABS
+ unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS
+ // start in symbol cycle for Rans
+ unsigned int prob : RANS_PROB_BITS; // Probability of this symbol
+};
+
+struct BufAnsCoder {
+ struct aom_internal_error_info *error;
+ struct buffered_ans_symbol *buf;
+ struct AnsCoder ans;
+ int size;
+ int offset;
+ int output_bytes;
+#if ANS_MAX_SYMBOLS
+ int window_size;
+#endif
+};
+
+// Allocate a buffered ANS coder to store size symbols.
+// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS
+// partition.
+// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the
+// buffer will grow on demand.
+void aom_buf_ans_alloc(struct BufAnsCoder *c,
+ struct aom_internal_error_info *error, int hint);
+
+void aom_buf_ans_free(struct BufAnsCoder *c);
+
+#if !ANS_MAX_SYMBOLS
+void aom_buf_ans_grow(struct BufAnsCoder *c);
+#endif
+
+void aom_buf_ans_flush(struct BufAnsCoder *const c);
+
+static INLINE void buf_ans_write_init(struct BufAnsCoder *const c,
+ uint8_t *const output_buffer) {
+ c->offset = 0;
+ c->output_bytes = 0;
+ ans_write_init(&c->ans, output_buffer);
+}
+
+static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val,
+ AnsP8 prob) {
+ assert(c->offset <= c->size);
+#if !ANS_MAX_SYMBOLS
+ if (c->offset == c->size) {
+ aom_buf_ans_grow(c);
+ }
+#endif
+ c->buf[c->offset].method = ANS_METHOD_RABS;
+ c->buf[c->offset].val_start = val;
+ c->buf[c->offset].prob = prob;
+ ++c->offset;
+#if ANS_MAX_SYMBOLS
+ if (c->offset == c->size) aom_buf_ans_flush(c);
+#endif
+}
+
+// Buffer one symbol for encoding using rANS.
+// cum_prob: The cumulative probability before this symbol (the offset of
+// the symbol in the symbol cycle)
+// prob: The probability of this symbol (l_s from the paper)
+// RANS_PRECISION takes the place of m from the paper.
+static INLINE void buf_rans_write(struct BufAnsCoder *const c,
+ aom_cdf_prob cum_prob, aom_cdf_prob prob) {
+ assert(c->offset <= c->size);
+#if !ANS_MAX_SYMBOLS
+ if (c->offset == c->size) {
+ aom_buf_ans_grow(c);
+ }
+#endif
+ c->buf[c->offset].method = ANS_METHOD_RANS;
+ c->buf[c->offset].val_start = cum_prob;
+ c->buf[c->offset].prob = prob;
+ ++c->offset;
+#if ANS_MAX_SYMBOLS
+ if (c->offset == c->size) aom_buf_ans_flush(c);
+#endif
+}
+
+static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) {
+ buf_rabs_write(c, bit, 128);
+}
+
+static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal,
+ int bits) {
+ int bit;
+
+ assert(bits < 31);
+ for (bit = bits - 1; bit >= 0; bit--)
+ buf_rabs_write_bit(c, 1 & (literal >> bit));
+}
+
+static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) {
+ assert(c->offset == 0);
+ return c->output_bytes;
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_DSP_BUF_ANS_H_
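Reading the pieces together, the intended lifecycle appears to be: allocate, point the coder at an output buffer, push symbols in forward (decode) order, flush, then read back the byte count. A rough sketch under that assumption; buffer sizing, error handling and the ans.h/answriter.h internals are glossed over, and the literal values are arbitrary:

#include "aom/internal/aom_codec_internal.h"
#include "aom_dsp/buf_ans.h"

// Buffers a few rABS symbols into `out` and returns the serialized byte
// count. `out` must be large enough for the flushed partition.
static int buf_ans_demo(uint8_t *out, struct aom_internal_error_info *error) {
  struct BufAnsCoder c;
  int bytes;
  aom_buf_ans_alloc(&c, error, 1024);   // capacity hint / partition size
  buf_ans_write_init(&c, out);

  buf_rabs_write_bit(&c, 1);            // one bit at even odds (prob 128)
  buf_rabs_write_literal(&c, 0x2A, 6);  // 6-bit literal, MSB first

  aom_buf_ans_flush(&c);                // serializes the buffer backwards
  bytes = buf_ans_write_end(&c);
  aom_buf_ans_free(&c);
  return bytes;
}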
diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c
new file mode 100644
index 000000000..0fc7b14a5
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolreader.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/daalaboolreader.h"
+
+int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
+ if (size && !buffer) {
+ return 1;
+ }
+ r->buffer_end = buffer + size;
+ r->buffer = buffer;
+ od_ec_dec_init(&r->ec, buffer, size - 1);
+#if CONFIG_ACCOUNTING
+ r->accounting = NULL;
+#endif
+ return 0;
+}
+
+const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
+ return r->buffer_end;
+}
+
+uint32_t aom_daala_reader_tell(const daala_reader *r) {
+ return od_ec_dec_tell(&r->ec);
+}
+
+uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
+ return od_ec_dec_tell_frac(&r->ec);
+}
diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h
new file mode 100644
index 000000000..428d74db0
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolreader.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_DAALABOOLREADER_H_
+#define AOM_DSP_DAALABOOLREADER_H_
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+#if CONFIG_BITSTREAM_DEBUG
+#include <stdio.h>
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct daala_reader {
+ const uint8_t *buffer;
+ const uint8_t *buffer_end;
+ od_ec_dec ec;
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+};
+
+typedef struct daala_reader daala_reader;
+
+int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
+const uint8_t *aom_daala_reader_find_end(daala_reader *r);
+uint32_t aom_daala_reader_tell(const daala_reader *r);
+uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
+
+static INLINE int aom_daala_read(daala_reader *r, int prob) {
+ int bit;
+#if CONFIG_EC_SMALLMUL
+ int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
+#else
+ int p = ((prob << 15) + 256 - prob) >> 8;
+#endif
+#if CONFIG_BITSTREAM_DEBUG
+/*{
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ if (frame_idx == 0 && queue_r == 0) {
+ fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n",
+ frame_idx, queue_r);
+ }
+}*/
+#endif
+
+ bit = od_ec_decode_bool_q15(&r->ec, p);
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int i;
+ int ref_bit, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
+ if (ref_nsymbs != 2) {
+ fprintf(stderr,
+ "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
+ "%d queue_r %d\n",
+ frame_idx, 2, ref_nsymbs, queue_r);
+ assert(0);
+ }
+ if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
+ (ref_cdf[1] != 32767)) {
+ fprintf(stderr,
+ "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
+ frame_idx, p, 32767, ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (bit != ref_bit) {
+ fprintf(stderr,
+ "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
+ "queue_r %d\n",
+ frame_idx, bit, ref_bit, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+ return bit;
+}
+
+#if CONFIG_RAWBITS
+static INLINE int aom_daala_read_bit(daala_reader *r) {
+ return od_ec_dec_bits(&r->ec, 1, "aom_bits");
+}
+#endif
+
+static INLINE int aom_daala_reader_has_error(daala_reader *r) {
+ return r->ec.error;
+}
+
+static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
+ int nsymbs) {
+ int symb;
+ symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int i;
+ int cdf_error = 0;
+ int ref_symb, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
+ if (nsymbs != ref_nsymbs) {
+ fprintf(stderr,
+ "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
+ "queue_r %d\n",
+ frame_idx, nsymbs, ref_nsymbs, queue_r);
+ cdf_error = 0;
+ assert(0);
+ } else {
+ for (i = 0; i < nsymbs; ++i)
+ if (cdf[i] != ref_cdf[i]) cdf_error = 1;
+ }
+ if (cdf_error) {
+ fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
+ cdf[0]);
+ for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
+ fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (symb != ref_symb) {
+ fprintf(
+ stderr,
+ "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
+ frame_idx, symb, ref_symb, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+ return symb;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c
new file mode 100644
index 000000000..0ba8f6ab8
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolwriter.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/daalaboolwriter.h"
+
+void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
+ br->buffer = source;
+ br->pos = 0;
+ od_ec_enc_init(&br->ec, 62025);
+}
+
+void aom_daala_stop_encode(daala_writer *br) {
+ uint32_t daala_bytes;
+ unsigned char *daala_data;
+ daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
+ memcpy(br->buffer, daala_data, daala_bytes);
+ br->pos = daala_bytes;
+  /* Prevent the ec bitstream from being detected as a superframe marker.
+ Must always be added, so that rawbits knows the exact length of the
+ bitstream. */
+ br->buffer[br->pos++] = 0;
+ od_ec_enc_clear(&br->ec);
+}
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h
new file mode 100644
index 000000000..bbaf53c69
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolwriter.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_DAALABOOLWRITER_H_
+#define AOM_DSP_DAALABOOLWRITER_H_
+
+#include <stdio.h>
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct daala_writer {
+ unsigned int pos;
+ uint8_t *buffer;
+ od_ec_enc ec;
+};
+
+typedef struct daala_writer daala_writer;
+
+void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
+void aom_daala_stop_encode(daala_writer *w);
+
+static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
+#if CONFIG_EC_SMALLMUL
+ int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
+#else
+ int p = ((prob << 15) + 256 - prob) >> 8;
+#endif
+#if CONFIG_BITSTREAM_DEBUG
+ aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
+ /*int queue_r = 0;
+ int frame_idx_r = 0;
+ int queue_w = bitstream_queue_get_write();
+ int frame_idx_w = bitstream_queue_get_frame_write();
+ if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+ fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+ frame_idx_w, queue_w);
+ }*/
+ bitstream_queue_push(bit, cdf, 2);
+#endif
+
+ od_ec_encode_bool_q15(&w->ec, bit, p);
+}
+
+#if CONFIG_RAWBITS
+static INLINE void aom_daala_write_bit(daala_writer *w, int bit) {
+ od_ec_enc_bits(&w->ec, bit, 1);
+}
+#endif
+
+static INLINE void daala_write_symbol(daala_writer *w, int symb,
+ const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_BITSTREAM_DEBUG
+ /*int queue_r = 0;
+ int frame_idx_r = 0;
+ int queue_w = bitstream_queue_get_write();
+ int frame_idx_w = bitstream_queue_get_frame_write();
+ if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+ fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+ frame_idx_w, queue_w);
+ }*/
+ bitstream_queue_push(symb, cdf, nsymbs);
+#endif
+
+ od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
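With both halves of the daala interface now in place, a rough round-trip sketch compiled inside the tree looks like the following; the buffer size, the probability (128, roughly even odds) and the bit pattern are illustrative only, and real callers size the buffer from the frame data:

#include <stdio.h>
#include "aom_dsp/daalaboolreader.h"
#include "aom_dsp/daalaboolwriter.h"

int main(void) {
  uint8_t buf[1024];
  const int bits[4] = { 1, 0, 0, 1 };
  daala_writer w;
  daala_reader r;
  int i;

  aom_daala_start_encode(&w, buf);
  for (i = 0; i < 4; ++i) aom_daala_write(&w, bits[i], 128);
  aom_daala_stop_encode(&w);  // appends the trailing zero byte

  if (aom_daala_reader_init(&r, buf, (int)w.pos)) return 1;
  for (i = 0; i < 4; ++i) {
    const int got = aom_daala_read(&r, 128);
    if (got != bits[i]) return 1;  // round trip should be exact
  }
  printf("round trip ok, %u byte(s)\n", w.pos);
  return 0;
}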
diff --git a/third_party/aom/aom_dsp/dkboolreader.c b/third_party/aom/aom_dsp/dkboolreader.c
new file mode 100644
index 000000000..288d5f1ce
--- /dev/null
+++ b/third_party/aom/aom_dsp/dkboolreader.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_config.h"
+
+#include "aom_dsp/dkboolreader.h"
+#include "aom_dsp/prob.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_util/endian_inl.h"
+
+static INLINE int aom_dk_read_bit(struct aom_dk_reader *r) {
+ return aom_dk_read(r, 128); // aom_prob_half
+}
+
+int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
+ size_t size, aom_decrypt_cb decrypt_cb,
+ void *decrypt_state) {
+ if (size && !buffer) {
+ return 1;
+ } else {
+ r->buffer_end = buffer + size;
+ r->buffer_start = r->buffer = buffer;
+ r->value = 0;
+ r->count = -8;
+ r->range = 255;
+ r->decrypt_cb = decrypt_cb;
+ r->decrypt_state = decrypt_state;
+ aom_dk_reader_fill(r);
+#if CONFIG_ACCOUNTING
+ r->accounting = NULL;
+#endif
+ return aom_dk_read_bit(r) != 0; // marker bit
+ }
+}
+
+void aom_dk_reader_fill(struct aom_dk_reader *r) {
+ const uint8_t *const buffer_end = r->buffer_end;
+ const uint8_t *buffer = r->buffer;
+ const uint8_t *buffer_start = buffer;
+ BD_VALUE value = r->value;
+ int count = r->count;
+ const size_t bytes_left = buffer_end - buffer;
+ const size_t bits_left = bytes_left * CHAR_BIT;
+ int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
+
+ if (r->decrypt_cb) {
+ size_t n = AOMMIN(sizeof(r->clear_buffer), bytes_left);
+ r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
+ buffer = r->clear_buffer;
+ buffer_start = r->clear_buffer;
+ }
+ if (bits_left > BD_VALUE_SIZE) {
+ const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+ BD_VALUE nv;
+ BD_VALUE big_endian_values;
+ memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+#if SIZE_MAX == 0xffffffffffffffffULL
+ big_endian_values = HToBE64(big_endian_values);
+#else
+ big_endian_values = HToBE32(big_endian_values);
+#endif
+ nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+ count += bits;
+ buffer += (bits >> 3);
+ value = r->value | (nv << (shift & 0x7));
+ } else {
+ const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
+ int loop_end = 0;
+ if (bits_over >= 0) {
+ count += LOTS_OF_BITS;
+ loop_end = bits_over;
+ }
+
+ if (bits_over < 0 || bits_left) {
+ while (shift >= loop_end) {
+ count += CHAR_BIT;
+ value |= (BD_VALUE)*buffer++ << shift;
+ shift -= CHAR_BIT;
+ }
+ }
+ }
+
+ // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
+ // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
+ // assign 'buffer' to 'r->buffer'.
+ r->buffer += buffer - buffer_start;
+ r->value = value;
+ r->count = count;
+}
+
+const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r) {
+ // Find the end of the coded buffer
+ while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
+ r->count -= CHAR_BIT;
+ r->buffer--;
+ }
+ return r->buffer;
+}
diff --git a/third_party/aom/aom_dsp/dkboolreader.h b/third_party/aom/aom_dsp/dkboolreader.h
new file mode 100644
index 000000000..f0bc84381
--- /dev/null
+++ b/third_party/aom/aom_dsp/dkboolreader.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_DKBOOLREADER_H_
+#define AOM_DSP_DKBOOLREADER_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <limits.h>
+
+#include "./aom_config.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include <assert.h>
+#include <stdio.h>
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "aom_ports/mem.h"
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define LOTS_OF_BITS 0x40000000
+
+struct aom_dk_reader {
+ // Be careful when reordering this struct, it may impact the cache negatively.
+ BD_VALUE value;
+ unsigned int range;
+ int count;
+ const uint8_t *buffer_start;
+ const uint8_t *buffer_end;
+ const uint8_t *buffer;
+ aom_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+ uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+};
+
+int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
+ size_t size, aom_decrypt_cb decrypt_cb,
+ void *decrypt_state);
+
+void aom_dk_reader_fill(struct aom_dk_reader *r);
+
+const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r);
+
+static INLINE uint32_t aom_dk_reader_tell(const struct aom_dk_reader *r) {
+ const uint32_t bits_read =
+ (uint32_t)((r->buffer - r->buffer_start) * CHAR_BIT);
+ const int count =
+ (r->count < LOTS_OF_BITS) ? r->count : r->count - LOTS_OF_BITS;
+ assert(r->buffer >= r->buffer_start);
+ return bits_read - (count + CHAR_BIT);
+}
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+ 3 => 1/8th bits.*/
+#define DK_BITRES (3)
+
+static INLINE uint32_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) {
+ uint32_t num_bits;
+ uint32_t range;
+ int l;
+ int i;
+ num_bits = aom_dk_reader_tell(r) << DK_BITRES;
+ range = r->range;
+ l = 0;
+ for (i = DK_BITRES; i-- > 0;) {
+ int b;
+ range = range * range >> 7;
+ b = (int)(range >> 8);
+ l = l << 1 | b;
+ range >>= b;
+ }
+ return num_bits - l;
+}
+
+static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) {
+ // Check if we have reached the end of the buffer.
+ //
+ // Variable 'count' stores the number of bits in the 'value' buffer, minus
+ // 8. The top byte is part of the algorithm, and the remainder is buffered
+ // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+ // occupied, 8 for the algorithm and 8 in the buffer.
+ //
+ // When reading a byte from the user's buffer, count is filled with 8 and
+ // one byte is filled into the value buffer. When we reach the end of the
+ // data, count is additionally filled with LOTS_OF_BITS. So when
+ // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
+ //
+  // Returns 1 if we have tried to decode bits after the end of the stream was
+  // encountered, 0 otherwise (no error).
+ return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
+}
+
+static INLINE int aom_dk_read(struct aom_dk_reader *r, int prob) {
+ unsigned int bit = 0;
+ BD_VALUE value;
+ BD_VALUE bigsplit;
+ int count;
+ unsigned int range;
+ unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
+
+ if (r->count < 0) aom_dk_reader_fill(r);
+
+ value = r->value;
+ count = r->count;
+
+ bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
+
+ range = split;
+
+ if (value >= bigsplit) {
+ range = r->range - split;
+ value = value - bigsplit;
+ bit = 1;
+ }
+
+ {
+ register int shift = aom_norm[range];
+ range <<= shift;
+ value <<= shift;
+ count -= shift;
+ }
+ r->value = value;
+ r->count = count;
+ r->range = range;
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int ref_bit, ref_prob;
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_bit, &ref_prob);
+ if (prob != ref_prob) {
+ fprintf(
+ stderr,
+ "\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n",
+ frame_idx, prob, ref_prob, queue_r);
+ assert(0);
+ }
+ if ((int)bit != ref_bit) {
+ fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n",
+ frame_idx, bit, ref_bit);
+ assert(0);
+ }
+ }
+#endif // CONFIG_BITSTREAM_DEBUG
+
+ return bit;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_DKBOOLREADER_H_
diff --git a/third_party/aom/aom_dsp/dkboolwriter.c b/third_party/aom/aom_dsp/dkboolwriter.c
new file mode 100644
index 000000000..fc98e7c9b
--- /dev/null
+++ b/third_party/aom/aom_dsp/dkboolwriter.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./dkboolwriter.h"
+
+static INLINE void aom_dk_write_bit(aom_dk_writer *w, int bit) {
+ aom_dk_write(w, bit, 128); // aom_prob_half
+}
+
+void aom_dk_start_encode(aom_dk_writer *br, uint8_t *source) {
+ br->lowvalue = 0;
+ br->range = 255;
+ br->count = -24;
+ br->buffer = source;
+ br->pos = 0;
+ aom_dk_write_bit(br, 0);
+}
+
+void aom_dk_stop_encode(aom_dk_writer *br) {
+ int i;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_skip_write(1);
+#endif // CONFIG_BITSTREAM_DEBUG
+
+ for (i = 0; i < 32; i++) aom_dk_write_bit(br, 0);
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_skip_write(0);
+#endif // CONFIG_BITSTREAM_DEBUG
+
+  // Ensure there's no ambiguous collision with any index marker bytes
+ if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
+}
diff --git a/third_party/aom/aom_dsp/dkboolwriter.h b/third_party/aom/aom_dsp/dkboolwriter.h
new file mode 100644
index 000000000..835436885
--- /dev/null
+++ b/third_party/aom/aom_dsp/dkboolwriter.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_DKBOOLWRITER_H_
+#define AOM_DSP_DKBOOLWRITER_H_
+
+#include "./aom_config.h"
+
+#if CONFIG_BITSTREAM_DEBUG
+#include <stdio.h>
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "aom_dsp/prob.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct aom_dk_writer {
+ unsigned int lowvalue;
+ unsigned int range;
+ int count;
+ unsigned int pos;
+ uint8_t *buffer;
+} aom_dk_writer;
+
+void aom_dk_start_encode(aom_dk_writer *bc, uint8_t *buffer);
+void aom_dk_stop_encode(aom_dk_writer *bc);
+
+static INLINE void aom_dk_write(aom_dk_writer *br, int bit, int probability) {
+ unsigned int split;
+ int count = br->count;
+ unsigned int range = br->range;
+ unsigned int lowvalue = br->lowvalue;
+ register int shift;
+
+#if CONFIG_BITSTREAM_DEBUG
+ // int queue_r = 0;
+ // int frame_idx_r = 0;
+ // int queue_w = bitstream_queue_get_write();
+ // int frame_idx_w = bitstream_queue_get_frame_write();
+ // if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+ // fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+ // frame_idx_w, queue_w);
+ // }
+ bitstream_queue_push(bit, probability);
+#endif // CONFIG_BITSTREAM_DEBUG
+
+ split = 1 + (((range - 1) * probability) >> 8);
+
+ range = split;
+
+ if (bit) {
+ lowvalue += split;
+ range = br->range - split;
+ }
+
+ shift = aom_norm[range];
+
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0) {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000) {
+ int x = br->pos - 1;
+
+ while (x >= 0 && br->buffer[x] == 0xff) {
+ br->buffer[x] = 0;
+ x--;
+ }
+
+ br->buffer[x] += 1;
+ }
+
+ br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8;
+ }
+
+ lowvalue <<= shift;
+ br->count = count;
+ br->lowvalue = lowvalue;
+ br->range = range;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_DKBOOLWRITER_H_
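For comparison with the daala writer above, the dk (VPx-style) writer is driven the same way; a bare sketch, where the probabilities use the 8-bit 1..255 scale and 128 means even odds (the specific values are arbitrary):

#include <stdio.h>
#include "aom_dsp/dkboolwriter.h"

int main(void) {
  uint8_t buf[1024];
  aom_dk_writer w;

  aom_dk_start_encode(&w, buf);
  aom_dk_write(&w, 1, 128);  // one bit at even odds
  aom_dk_write(&w, 0, 200);  // a more skewed probability
  aom_dk_stop_encode(&w);

  printf("dk writer produced %u byte(s)\n", w.pos);
  return 0;
}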
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
new file mode 100644
index 000000000..ad76b7e3e
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "./config.h"
+#endif
+
+#include "aom_dsp/entcode.h"
+
+/*Given the current total integer number of bits used and the current value of
+ rng, computes the fraction number of bits used to OD_BITRES precision.
+ This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
+ nbits_total: The number of whole bits currently used, i.e., the value
+ returned by od_ec_enc_tell() or od_ec_dec_tell().
+ rng: The current value of rng from either the encoder or decoder state.
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
+ uint32_t nbits;
+ int l;
+ int i;
+ /*To handle the non-integral number of bits still left in the encoder/decoder
+ state, we compute the worst-case number of bits of val that must be
+ encoded to ensure that the value is inside the range for any possible
+ subsequent bits.
+ The computation here is independent of val itself (the decoder does not
+ even track that value), even though the real number of bits used after
+ od_ec_enc_done() may be 1 smaller if rng is a power of two and the
+ corresponding trailing bits of val are all zeros.
+ If we did try to track that special case, then coding a value with a
+ probability of 1/(1 << n) might sometimes appear to use more than n bits.
+ This may help explain the surprising result that a newly initialized
+ encoder or decoder claims to have used 1 bit.*/
+ nbits = nbits_total << OD_BITRES;
+ l = 0;
+ for (i = OD_BITRES; i-- > 0;) {
+ int b;
+ rng = rng * rng >> 15;
+ b = (int)(rng >> 16);
+ l = l << 1 | b;
+ rng >>= b;
+ }
+ return nbits - l;
+}
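A quick worked instance of the accounting above, with OD_BITRES == 3 so the result is in eighth-bit units; the inputs are arbitrary:

#include <assert.h>
#include <stdio.h>
#include "aom_dsp/entcode.h"

int main(void) {
  /* 10 whole bits with rng exactly 32768: no fractional part is pending. */
  assert(od_ec_tell_frac(10, 0x8000u) == 10 << OD_BITRES);  /* == 80 */
  /* rng == 0xC000 leaves log2(1.5) ~= 0.585 bits of slack, i.e. ~4.7
     eighth-bits, which the loop truncates to 4: 80 - 4 == 76. */
  assert(od_ec_tell_frac(10, 0xC000u) == 76);
  printf("tell_frac checks passed\n");
  return 0;
}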
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
new file mode 100644
index 000000000..534959e66
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !defined(_entcode_H)
+#define _entcode_H (1)
+#include <limits.h>
+#include <stddef.h>
+#include "av1/common/odintrin.h"
+
+/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
+ on a larger type, you can speed up the decoder by using it here.*/
+typedef uint32_t od_ec_window;
+
+#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
+
+/*The number of bits to use for the range-coded part of unsigned integers.*/
+#define OD_EC_UINT_BITS (4)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+ 3 => 1/8th bits.*/
+#define OD_BITRES (3)
+
+/*With CONFIG_EC_SMALLMUL, the value stored in a CDF is 32768 minus the actual
+ Q15 cumulative probability (an "inverse" CDF).
+ This function converts from one representation to the other (and is its own
+ inverse).*/
+#if CONFIG_EC_SMALLMUL
+#define OD_ICDF(x) (32768U - (x))
+#else
+#define OD_ICDF(x) (x)
+#endif
+
+/*See entcode.c for further documentation.*/
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
+ uint32_t rng);
+
+#endif
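The OD_ICDF() wrapper above lets CDF tables be written once and stored either way round. As an illustration (not a table from the codebase), a uniform 4-symbol Q15 CDF:

#include "aom_dsp/entcode.h"

/* Wrapping each entry in OD_ICDF() stores 8192/16384/24576/32768 directly
   when CONFIG_EC_SMALLMUL is off, and the inverted values 24576/16384/8192/0
   when it is on. */
static const uint16_t uniform4_cdf[4] = { OD_ICDF(8192), OD_ICDF(16384),
                                          OD_ICDF(24576), OD_ICDF(32768) };

static int uniform4_is_terminated(void) {
  /* This is the form the decoder checks: cdf[nsyms - 1] == OD_ICDF(32768). */
  return uniform4_cdf[3] == OD_ICDF(32768U);
}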
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
new file mode 100644
index 000000000..49b176cd8
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "./config.h"
+#endif
+
+#include "aom_dsp/entdec.h"
+
+/*A range decoder.
+ This is an entropy decoder based upon \cite{Mar79}, which is itself a
+ rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
+ It is very similar to arithmetic encoding, except that encoding is done with
+ digits in any base, instead of with bits, and so it is faster when using
+  larger bases (i.e., a byte).
+ The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
+ is the base, longer than the theoretical optimum, but to my knowledge there
+ is no published justification for this claim.
+ This only seems true when using near-infinite precision arithmetic so that
+ the process is carried out with no rounding errors.
+
+ An excellent description of implementation details is available at
+ http://www.arturocampos.com/ac_range.html
+ A recent work \cite{MNW98} which proposes several changes to arithmetic
+ encoding for efficiency actually re-discovers many of the principles
+ behind range encoding, and presents a good theoretical analysis of them.
+
+ End of stream is handled by writing out the smallest number of bits that
+ ensures that the stream will be correctly decoded regardless of the value of
+ any subsequent bits.
+ od_ec_dec_tell() can be used to determine how many bits were needed to decode
+ all the symbols thus far; other data can be packed in the remaining bits of
+ the input buffer.
+ @PHDTHESIS{Pas76,
+ author="Richard Clark Pasco",
+ title="Source coding algorithms for fast data compression",
+ school="Dept. of Electrical Engineering, Stanford University",
+ address="Stanford, CA",
+ month=May,
+ year=1976,
+ URL="http://www.richpasco.org/scaffdc.pdf"
+ }
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video & Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+#define OD_EC_LOTS_OF_BITS (0x4000)
+
+static void od_ec_dec_refill(od_ec_dec *dec) {
+ int s;
+ od_ec_window dif;
+ int16_t cnt;
+ const unsigned char *bptr;
+ const unsigned char *end;
+ dif = dec->dif;
+ cnt = dec->cnt;
+ bptr = dec->bptr;
+ end = dec->end;
+ s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
+ for (; s >= 0 && bptr < end; s -= 8, bptr++) {
+ OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8);
+ dif ^= (od_ec_window)bptr[0] << s;
+ cnt += 8;
+ }
+ if (bptr >= end) {
+ dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
+ cnt = OD_EC_LOTS_OF_BITS;
+ }
+ dec->dif = dif;
+ dec->cnt = cnt;
+ dec->bptr = bptr;
+}
+
+/*Takes updated dif and range values, renormalizes them so that
+ 32768 <= rng < 65536 (reading more bytes from the stream into dif if
+ necessary), and stores them back in the decoder context.
+ dif: The new value of dif.
+ rng: The new value of the range.
+ ret: The value to return.
+ Return: ret.
+ This allows the compiler to jump to this function via a tail-call.*/
+static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
+ int ret) {
+ int d;
+ OD_ASSERT(rng <= 65535U);
+ d = 16 - OD_ILOG_NZ(rng);
+ dec->cnt -= d;
+#if CONFIG_EC_SMALLMUL
+ /*This is equivalent to shifting in 1's instead of 0's.*/
+ dec->dif = ((dif + 1) << d) - 1;
+#else
+ dec->dif = dif << d;
+#endif
+ dec->rng = rng << d;
+ if (dec->cnt < 0) od_ec_dec_refill(dec);
+ return ret;
+}
+
+/*Initializes the decoder.
+  buf: The input buffer to use.
+  storage: The size in bytes of the input buffer.*/
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
+ uint32_t storage) {
+ dec->buf = buf;
+ dec->eptr = buf + storage;
+ dec->end_window = 0;
+ dec->nend_bits = 0;
+ dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
+ dec->end = buf + storage;
+ dec->bptr = buf;
+#if CONFIG_EC_SMALLMUL
+ dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
+#else
+ dec->dif = 0;
+#endif
+ dec->rng = 0x8000;
+ dec->cnt = -15;
+ dec->error = 0;
+ od_ec_dec_refill(dec);
+}
+
+/*Decode a single binary value.
+ {EC_SMALLMUL} f: The probability that the bit is one, scaled by 32768.
+ {else} f: The probability that the bit is zero, scaled by 32768.
+ Return: The value decoded (0 or 1).*/
+int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
+ od_ec_window dif;
+ od_ec_window vw;
+ unsigned r;
+ unsigned r_new;
+ unsigned v;
+ int ret;
+ OD_ASSERT(0 < f);
+ OD_ASSERT(f < 32768U);
+ dif = dec->dif;
+ r = dec->rng;
+ OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+ OD_ASSERT(32768U <= r);
+#if CONFIG_EC_SMALLMUL
+ v = (r >> 8) * (uint32_t)f >> 7;
+ vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+ ret = 1;
+ r_new = v;
+ if (dif >= vw) {
+ r_new = r - v;
+ dif -= vw;
+ ret = 0;
+ }
+#else
+ v = f * (uint32_t)r >> 15;
+ vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+ ret = 0;
+ r_new = v;
+ if (dif >= vw) {
+ r_new = r - v;
+ dif -= vw;
+ ret = 1;
+ }
+#endif
+ return od_ec_dec_normalize(dec, dif, r_new, ret);
+}
+
+/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15.
+ cdf: The CDF, such that symbol s falls in the range
+ [s > 0 ? cdf[s - 1] : 0, cdf[s]).
+       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
+        must be 32768.
+ {EC_SMALLMUL}: The CDF contains 32768 minus those values.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.
+ Return: The decoded symbol s.*/
+int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
+ od_ec_window dif;
+ unsigned r;
+ unsigned c;
+ unsigned u;
+ unsigned v;
+ int ret;
+ (void)nsyms;
+ dif = dec->dif;
+ r = dec->rng;
+ OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+ OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U));
+ OD_ASSERT(32768U <= r);
+#if CONFIG_EC_SMALLMUL
+ c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+ v = r;
+ ret = -1;
+ do {
+ u = v;
+ v = (r >> 8) * (uint32_t)cdf[++ret] >> 7;
+ } while (c < v);
+ OD_ASSERT(v < u);
+ OD_ASSERT(u <= r);
+ r = u - v;
+ dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+#else
+ c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+ v = 0;
+ ret = -1;
+ do {
+ u = v;
+ v = cdf[++ret] * (uint32_t)r >> 15;
+ } while (v <= c);
+ OD_ASSERT(u < v);
+ OD_ASSERT(v <= r);
+ r = v - u;
+ dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
+#endif
+ return od_ec_dec_normalize(dec, dif, r, ret);
+}
+
+#if CONFIG_RAWBITS
+/*Extracts a sequence of raw bits from the stream.
+ The bits must have been encoded with od_ec_enc_bits().
+ ftb: The number of bits to extract.
+ This must be between 0 and 25, inclusive.
+ Return: The decoded bits.*/
+uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
+ od_ec_window window;
+ int available;
+ uint32_t ret;
+ OD_ASSERT(ftb <= 25);
+ window = dec->end_window;
+ available = dec->nend_bits;
+ if ((unsigned)available < ftb) {
+ const unsigned char *buf;
+ const unsigned char *eptr;
+ buf = dec->buf;
+ eptr = dec->eptr;
+ OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8);
+ do {
+ if (eptr <= buf) {
+ dec->tell_offs += OD_EC_LOTS_OF_BITS - available;
+ available = OD_EC_LOTS_OF_BITS;
+ break;
+ }
+ window |= (od_ec_window) * --eptr << available;
+ available += 8;
+ } while (available <= OD_EC_WINDOW_SIZE - 8);
+ dec->eptr = eptr;
+ }
+ ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1);
+ window >>= ftb;
+ available -= ftb;
+ dec->end_window = window;
+ dec->nend_bits = available;
+ return ret;
+}
+#endif
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Return: The number of bits.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+int od_ec_dec_tell(const od_ec_dec *dec) {
+ return (int)(((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 -
+ dec->cnt - dec->nend_bits + dec->tell_offs);
+}
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
+ return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
+}
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
new file mode 100644
index 000000000..e1145e81d
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !defined(_entdec_H)
+#define _entdec_H (1)
+#include <limits.h>
+#include "aom_dsp/entcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct od_ec_dec od_ec_dec;
+
+#if defined(OD_ACCOUNTING) && OD_ACCOUNTING
+#define OD_ACC_STR , char *acc_str
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
+#else
+#define OD_ACC_STR
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
+#endif
+
+/*The entropy decoder context.*/
+struct od_ec_dec {
+ /*The start of the current input buffer.*/
+ const unsigned char *buf;
+ /*The read pointer for the raw bits.*/
+ const unsigned char *eptr;
+ /*Bits that will be read from/written at the end.*/
+ od_ec_window end_window;
+ /*Number of valid bits in end_window.*/
+ int nend_bits;
+ /*An offset used to keep track of tell after reaching the end of the stream.
+ This is constant throughout most of the decoding process, but becomes
+ important once we hit the end of the buffer and stop incrementing pointers
+ (and instead pretend cnt/nend_bits have lots of bits).*/
+ int32_t tell_offs;
+ /*The end of the current input buffer.*/
+ const unsigned char *end;
+ /*The read pointer for the entropy-coded bits.*/
+ const unsigned char *bptr;
+ /*The difference between the coded value and the low end of the current
+ range.
+ {EC_SMALLMUL} The difference between the high end of the current range,
+ (low + rng), and the coded value, minus 1.
+ This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
+ decoder only uses the top 16 bits of the window to decode the next symbol.
+ As we shift up during renormalization, if we don't have enough bits left in
+ the window to fill the top 16, we'll read in more bits of the coded
+ value.*/
+ od_ec_window dif;
+ /*The number of values in the current range.*/
+ uint16_t rng;
+ /*The number of bits of data in the current value.*/
+ int16_t cnt;
+ /*Nonzero if an error occurred.*/
+ int error;
+};
+
+/*See entdec.c for further documentation.*/
+
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
+ const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
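A hedged usage sketch for the declarations above: initialize a decoder over an existing compressed buffer, read one boolean, and check the error flag. The helper name and the Q15 probability (16384, i.e. one half under either build convention) are placeholders, not part of the imported sources.

#include "aom_dsp/entdec.h"

/* Illustrative only: minimal decode flow over an existing buffer. */
static int read_one_flag(const unsigned char *data, uint32_t size) {
  od_ec_dec dec;
  int flag;
  od_ec_dec_init(&dec, data, size);
  flag = od_ec_decode_bool_q15(&dec, 16384);
  return dec.error ? -1 : flag;
}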
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
new file mode 100644
index 000000000..a350f27f4
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "./config.h"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include "aom_dsp/entenc.h"
+
+/*A range encoder.
+ See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
+
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video \& Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*Takes updated low and range values, renormalizes them so that
+ 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
+ necessary), and stores them back in the encoder context.
+ low: The new value of low.
+ rng: The new value of the range.*/
+static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
+ unsigned rng) {
+ int d;
+ int c;
+ int s;
+ c = enc->cnt;
+ OD_ASSERT(rng <= 65535U);
+ d = 16 - OD_ILOG_NZ(rng);
+ s = c + d;
+ /*TODO: Right now we flush every time we have at least one byte available.
+ Instead we should use an od_ec_window and flush right before we're about to
+ shift bits off the end of the window.
+ For a 32-bit window this is about the same amount of work, but for a 64-bit
+ window it should be a fair win.*/
+ if (s >= 0) {
+ uint16_t *buf;
+ uint32_t storage;
+ uint32_t offs;
+ unsigned m;
+ buf = enc->precarry_buf;
+ storage = enc->precarry_storage;
+ offs = enc->offs;
+ if (offs + 2 > storage) {
+ storage = 2 * storage + 2;
+ buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
+ if (buf == NULL) {
+ enc->error = -1;
+ enc->offs = 0;
+ return;
+ }
+ enc->precarry_buf = buf;
+ enc->precarry_storage = storage;
+ }
+ c += 16;
+ m = (1 << c) - 1;
+ if (s >= 8) {
+ OD_ASSERT(offs < storage);
+ buf[offs++] = (uint16_t)(low >> c);
+ low &= m;
+ c -= 8;
+ m >>= 8;
+ }
+ OD_ASSERT(offs < storage);
+ buf[offs++] = (uint16_t)(low >> c);
+ s = c + d - 24;
+ low &= m;
+ enc->offs = offs;
+ }
+ enc->low = low << d;
+ enc->rng = rng << d;
+ enc->cnt = s;
+}
+
+/*Initializes the encoder.
+ size: The initial size of the buffer, in bytes.*/
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
+ od_ec_enc_reset(enc);
+ enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size);
+ enc->storage = size;
+ if (size > 0 && enc->buf == NULL) {
+ enc->storage = 0;
+ enc->error = -1;
+ }
+ enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
+ enc->precarry_storage = size;
+ if (size > 0 && enc->precarry_buf == NULL) {
+ enc->precarry_storage = 0;
+ enc->error = -1;
+ }
+}
+
+/*Reinitializes the encoder.*/
+void od_ec_enc_reset(od_ec_enc *enc) {
+ enc->end_offs = 0;
+ enc->end_window = 0;
+ enc->nend_bits = 0;
+ enc->offs = 0;
+ enc->low = 0;
+ enc->rng = 0x8000;
+ /*This is initialized to -9 so that it crosses zero after we've accumulated
+ one byte + one carry bit.*/
+ enc->cnt = -9;
+ enc->error = 0;
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy = 0;
+ enc->nb_symbols = 0;
+#endif
+}
+
+/*Frees the buffers used by the encoder.*/
+void od_ec_enc_clear(od_ec_enc *enc) {
+ free(enc->precarry_buf);
+ free(enc->buf);
+}
+
+/*Encodes a symbol given its frequency in Q15.
+ fl: The cumulative frequency of all symbols that come before the one to be
+ encoded.
+ fh: The cumulative frequency of all symbols up to and including the one to
+ be encoded.
+ {EC_SMALLMUL} Both values are replaced by 32768 minus the nominal value.*/
+static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
+ od_ec_window l;
+ unsigned r;
+ unsigned u;
+ unsigned v;
+ l = enc->low;
+ r = enc->rng;
+ OD_ASSERT(32768U <= r);
+#if CONFIG_EC_SMALLMUL
+ OD_ASSERT(fh < fl);
+ OD_ASSERT(fl <= 32768U);
+ if (fl < 32768U) {
+ u = (r >> 8) * (uint32_t)fl >> 7;
+ v = (r >> 8) * (uint32_t)fh >> 7;
+ l += r - u;
+ r = u - v;
+ } else {
+ r -= (r >> 8) * (uint32_t)fh >> 7;
+ }
+#else
+ OD_ASSERT(fl < fh);
+ OD_ASSERT(fh <= 32768U);
+ u = fl * (uint32_t)r >> 15;
+ v = fh * (uint32_t)r >> 15;
+ r = v - u;
+ l += u;
+#endif
+ od_ec_enc_normalize(enc, l, r);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encode a single binary value.
+ val: The value to encode (0 or 1).
+ {EC_SMALLMUL} f: The probability that val is one, scaled by 32768.
+ {else} f: The probability that val is zero, scaled by 32768.*/
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
+ od_ec_window l;
+ unsigned r;
+ unsigned v;
+ OD_ASSERT(0 < f);
+ OD_ASSERT(f < 32768U);
+ l = enc->low;
+ r = enc->rng;
+ OD_ASSERT(32768U <= r);
+#if CONFIG_EC_SMALLMUL
+ v = (r >> 8) * (uint32_t)f >> 7;
+ if (val) l += r - v;
+ r = val ? v : r - v;
+#else
+ v = f * (uint32_t)r >> 15;
+ if (val) l += v;
+ r = val ? r - v : v;
+#endif
+ od_ec_enc_normalize(enc, l, r);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy -=
+ OD_LOG2((double)(val ? 32768 - OD_ICDF(f) : OD_ICDF(f)) / 32768.);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
+ s: The index of the symbol to encode.
+ cdf: The CDF, such that symbol s falls in the range
+ [s > 0 ? cdf[s - 1] : 0, cdf[s]).
+ The values must be monotonically non-decreasing, and the last value
+ must be exactly 32768.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.*/
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf,
+ int nsyms) {
+ (void)nsyms;
+ OD_ASSERT(s >= 0);
+ OD_ASSERT(s < nsyms);
+ OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U));
+ od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : OD_ICDF(0), cdf[s]);
+}
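A hedged sketch of how a caller might build the CDF argument for od_ec_encode_cdf_q15(); the three-symbol distribution and the helper name are made up. It assumes OD_ICDF() from aom_dsp/entcode.h maps nominal Q15 cumulative values to the representation this build expects, as the assertions above already rely on.

#include "aom_dsp/entenc.h"

/* Illustrative only: a 3-symbol alphabet with nominal probabilities
   1/2, 1/4, 1/4; each entry is the cumulative total through that symbol,
   ending at 32768. */
static void encode_three_way_choice(od_ec_enc *enc, int s) {
  static const uint16_t cdf[3] = { OD_ICDF(16384), OD_ICDF(24576),
                                   OD_ICDF(32768) };
  od_ec_encode_cdf_q15(enc, s, cdf, 3);
}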
+
+#if CONFIG_RAWBITS
+/*Encodes a sequence of raw bits in the stream.
+ fl: The bits to encode.
+ ftb: The number of bits to encode.
+ This must be between 0 and 25, inclusive.*/
+void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
+ od_ec_window end_window;
+ int nend_bits;
+ OD_ASSERT(ftb <= 25);
+ OD_ASSERT(fl < (uint32_t)1 << ftb);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy += ftb;
+#endif
+ end_window = enc->end_window;
+ nend_bits = enc->nend_bits;
+ if (nend_bits + ftb > OD_EC_WINDOW_SIZE) {
+ unsigned char *buf;
+ uint32_t storage;
+ uint32_t end_offs;
+ buf = enc->buf;
+ storage = enc->storage;
+ end_offs = enc->end_offs;
+ if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) {
+ unsigned char *new_buf;
+ uint32_t new_storage;
+ new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3);
+ new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage);
+ if (new_buf == NULL) {
+ enc->error = -1;
+ enc->end_offs = 0;
+ return;
+ }
+ OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs,
+ end_offs);
+ storage = new_storage;
+ free(buf);
+ enc->buf = buf = new_buf;
+ enc->storage = storage;
+ }
+ do {
+ OD_ASSERT(end_offs < storage);
+ buf[storage - ++end_offs] = (unsigned char)end_window;
+ end_window >>= 8;
+ nend_bits -= 8;
+ } while (nend_bits >= 8);
+ enc->end_offs = end_offs;
+ }
+ OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE);
+ end_window |= (od_ec_window)fl << nend_bits;
+ nend_bits += ftb;
+ enc->end_window = end_window;
+ enc->nend_bits = nend_bits;
+}
+#endif
+
+/*Overwrites a few bits at the very start of an existing stream, after they
+ have already been encoded.
+ This makes it possible to have a few flags up front, where it is easy for
+ decoders to access them without parsing the whole stream, even if their
+ values are not determined until late in the encoding process, without having
+ to buffer all the intermediate symbols in the encoder.
+ In order for this to work, at least nbits bits must have already been encoded
+ using probabilities that are an exact power of two.
+ The encoder can verify the number of encoded bits is sufficient, but cannot
+ check this latter condition.
+ val: The bits to encode (in the nbits least-significant bits).
+ They will be decoded in order from most-significant to least.
+ nbits: The number of bits to overwrite.
+ This must be no more than 8.*/
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
+ int shift;
+ unsigned mask;
+ OD_ASSERT(nbits >= 0);
+ OD_ASSERT(nbits <= 8);
+ OD_ASSERT(val < 1U << nbits);
+ shift = 8 - nbits;
+ mask = ((1U << nbits) - 1) << shift;
+ if (enc->offs > 0) {
+ /*The first byte has been finalized.*/
+ enc->precarry_buf[0] =
+ (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
+ } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
+ /*The first byte has yet to be output.*/
+ enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
+ (od_ec_window)val << (16 + enc->cnt + shift);
+ } else {
+ /*The encoder hasn't even encoded nbits bits of data yet.*/
+ enc->error = -1;
+ }
+}
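A hedged sketch of the reserve-then-patch pattern described in the comment above; the helper, the two-bit flag field, and its placement are illustrative only. Coding the placeholders at f = 16384 (probability one half under either build convention) keeps them at an exact power of two, as required.

/* Illustrative only: reserve two flag bits up front, then overwrite them
   once their final value (which must fit in two bits) is known. */
static void encode_with_late_flags(od_ec_enc *enc, unsigned final_flags) {
  int i;
  for (i = 0; i < 2; i++) od_ec_encode_bool_q15(enc, 0, 16384);
  /* ... encode the rest of the stream here ... */
  od_ec_enc_patch_initial_bits(enc, final_flags, 2);
}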
+
+#if OD_MEASURE_EC_OVERHEAD
+#include <stdio.h>
+#endif
+
+/*Indicates that there are no more symbols to encode.
+ All remaining output bytes are flushed to the output buffer.
+ od_ec_enc_reset() should be called before using the encoder again.
+ nbytes: Returns the size, in bytes, of the encoded data in the returned buffer.
+ Return: A pointer to the start of the final buffer, or NULL if there was an
+ encoding error.*/
+unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
+ unsigned char *out;
+ uint32_t storage;
+ uint16_t *buf;
+ uint32_t offs;
+ uint32_t end_offs;
+ int nend_bits;
+ od_ec_window m;
+ od_ec_window e;
+ od_ec_window l;
+ unsigned r;
+ int c;
+ int s;
+ if (enc->error) return NULL;
+#if OD_MEASURE_EC_OVERHEAD
+ {
+ uint32_t tell;
+ /* Don't count the 1 bit we lose to raw bits as overhead. */
+ tell = od_ec_enc_tell(enc) - 1;
+ fprintf(stderr, "overhead: %f%%\n",
+ 100 * (tell - enc->entropy) / enc->entropy);
+ fprintf(stderr, "efficiency: %f bits/symbol\n",
+ (double)tell / enc->nb_symbols);
+ }
+#endif
+ /*We output the minimum number of bits that ensures that the symbols encoded
+ thus far will be decoded correctly regardless of the bits that follow.*/
+ l = enc->low;
+ r = enc->rng;
+ c = enc->cnt;
+ s = 9;
+ m = 0x7FFF;
+ e = (l + m) & ~m;
+ while ((e | m) >= l + r) {
+ s++;
+ m >>= 1;
+ e = (l + m) & ~m;
+ }
+ s += c;
+ offs = enc->offs;
+ buf = enc->precarry_buf;
+ if (s > 0) {
+ unsigned n;
+ storage = enc->precarry_storage;
+ if (offs + ((s + 7) >> 3) > storage) {
+ storage = storage * 2 + ((s + 7) >> 3);
+ buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
+ if (buf == NULL) {
+ enc->error = -1;
+ return NULL;
+ }
+ enc->precarry_buf = buf;
+ enc->precarry_storage = storage;
+ }
+ n = (1 << (c + 16)) - 1;
+ do {
+ OD_ASSERT(offs < storage);
+ buf[offs++] = (uint16_t)(e >> (c + 16));
+ e &= n;
+ s -= 8;
+ c -= 8;
+ n >>= 8;
+ } while (s > 0);
+ }
+ /*Make sure there's enough room for the entropy-coded bits and the raw
+ bits.*/
+ out = enc->buf;
+ storage = enc->storage;
+ end_offs = enc->end_offs;
+ e = enc->end_window;
+ nend_bits = enc->nend_bits;
+ s = -s;
+ c = OD_MAXI((nend_bits - s + 7) >> 3, 0);
+ if (offs + end_offs + c > storage) {
+ storage = offs + end_offs + c;
+ out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+ if (out == NULL) {
+ enc->error = -1;
+ return NULL;
+ }
+ OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs);
+ enc->buf = out;
+ enc->storage = storage;
+ }
+ /*If we have buffered raw bits, flush them as well.*/
+ while (nend_bits > s) {
+ OD_ASSERT(end_offs < storage);
+ out[storage - ++end_offs] = (unsigned char)e;
+ e >>= 8;
+ nend_bits -= 8;
+ }
+ *nbytes = offs + end_offs;
+ /*Perform carry propagation.*/
+ OD_ASSERT(offs + end_offs <= storage);
+ out = out + storage - (offs + end_offs);
+ c = 0;
+ end_offs = offs;
+ while (offs > 0) {
+ offs--;
+ c = buf[offs] + c;
+ out[offs] = (unsigned char)c;
+ c >>= 8;
+ }
+ /*Add any remaining raw bits to the last byte.
+ There is guaranteed to be enough room, because nend_bits <= s.*/
+ OD_ASSERT(nend_bits <= 0 || end_offs > 0);
+ if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e;
+ /*Note: Unless there's an allocation error, if you keep encoding into the
+ current buffer and call this function again later, everything will work
+ just fine (you won't get a new packet out, but you will get a single
+ buffer with the new data appended to the old).
+ However, this function is O(N) where N is the amount of data coded so far,
+ so calling it more than once for a given packet is a bad idea.*/
+ return out;
+}
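A hedged sketch of the lifecycle around od_ec_enc_done(): finalize, copy the result out, and leave the encoder ready for od_ec_enc_clear() or od_ec_enc_reset(). The helper name and the buffer handling are illustrative only.

#include <string.h>
#include "aom_dsp/entenc.h"

/* Illustrative only: returns the number of bytes written to dst, 0 on error. */
static uint32_t finish_stream(od_ec_enc *enc, unsigned char *dst,
                              uint32_t dst_size) {
  uint32_t nbytes;
  unsigned char *data = od_ec_enc_done(enc, &nbytes);
  if (data == NULL || nbytes > dst_size) return 0;
  /* The returned pointer aliases the encoder's internal buffer, so copy the
     data out before calling od_ec_enc_clear() or encoding anything else. */
  memcpy(dst, data, nbytes);
  return nbytes;
}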
+
+/*Returns the number of bits "used" by the encoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Warning: The value returned by this function can decrease compared to an
+ earlier call, even after encoding more data, if there is an encoding error
+ (i.e., a failure to allocate enough space for the output buffer).
+ Return: The number of bits.
+ This will always be slightly larger than the exact value (i.e., all
+ rounding error is in the positive direction).*/
+int od_ec_enc_tell(const od_ec_enc *enc) {
+ /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
+ bit, which we reserve for terminating the stream.*/
+ return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10;
+}
+
+/*Returns the number of bits "used" by the encoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Warning: The value returned by this function can decrease compared to an
+ earlier call, even after encoding more data, if there is an encoding error
+ (i.e., a failure to allocate enough space for the output buffer).
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (i.e., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
+ return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng);
+}
+
+/*Saves an entropy coder checkpoint to dst.
+ This allows an encoder to reverse a series of entropy coder
+ decisions if it decides that the information would have been
+ better coded some other way.*/
+void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) {
+ OD_COPY(dst, src, 1);
+}
+
+/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint.
+ This can only be used to restore from checkpoints earlier in the target
+ state's history: you cannot switch backwards and forwards or otherwise
+ switch to a state which isn't a causal ancestor of the current state.
+ Restore is also incompatible with patching the initial bits, as the
+ changes will remain in the restored version.*/
+void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
+ unsigned char *buf;
+ uint32_t storage;
+ uint16_t *precarry_buf;
+ uint32_t precarry_storage;
+ OD_ASSERT(dst->storage >= src->storage);
+ OD_ASSERT(dst->precarry_storage >= src->precarry_storage);
+ buf = dst->buf;
+ storage = dst->storage;
+ precarry_buf = dst->precarry_buf;
+ precarry_storage = dst->precarry_storage;
+ OD_COPY(dst, src, 1);
+ dst->buf = buf;
+ dst->storage = storage;
+ dst->precarry_buf = precarry_buf;
+ dst->precarry_storage = precarry_storage;
+}
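A hedged sketch of the trial-coding pattern the checkpoint/rollback pair above enables, using od_ec_enc_tell_frac() as the cost measure (in units of 1/2**OD_BITRES bits). The helper name, the callback, and the budget test are illustrative only.

/* Illustrative only: trial-encode a candidate and keep it only if it fits
   within the given fractional-bit budget; otherwise roll the coder back. */
static int try_candidate(od_ec_enc *enc, void (*code_candidate)(od_ec_enc *),
                         uint32_t max_cost_frac) {
  od_ec_enc checkpoint;
  uint32_t cost_before, cost_after;
  od_ec_enc_checkpoint(&checkpoint, enc);
  cost_before = od_ec_enc_tell_frac(enc);
  code_candidate(enc);
  cost_after = od_ec_enc_tell_frac(enc);
  if (cost_after - cost_before > max_cost_frac) {
    od_ec_enc_rollback(enc, &checkpoint);
    return 0; /* rejected */
  }
  return 1; /* kept */
}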
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
new file mode 100644
index 000000000..314b36318
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !defined(_entenc_H)
+#define _entenc_H (1)
+#include <stddef.h>
+#include "aom_dsp/entcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct od_ec_enc od_ec_enc;
+
+#define OD_MEASURE_EC_OVERHEAD (0)
+
+/*The entropy encoder context.*/
+struct od_ec_enc {
+ /*Buffered output.
+ This contains only the raw bits until the final call to od_ec_enc_done(),
+ where all the arithmetic-coded data gets prepended to it.*/
+ unsigned char *buf;
+ /*The size of the buffer.*/
+ uint32_t storage;
+ /*The offset at which the last byte containing raw bits was written.*/
+ uint32_t end_offs;
+ /*Bits that will be read from/written at the end.*/
+ od_ec_window end_window;
+ /*Number of valid bits in end_window.*/
+ int nend_bits;
+ /*A buffer for output bytes with their associated carry flags.*/
+ uint16_t *precarry_buf;
+ /*The size of the pre-carry buffer.*/
+ uint32_t precarry_storage;
+ /*The offset at which the next entropy-coded byte will be written.*/
+ uint32_t offs;
+ /*The low end of the current range.*/
+ od_ec_window low;
+ /*The number of values in the current range.*/
+ uint16_t rng;
+ /*The number of bits of data in the current value.*/
+ int16_t cnt;
+ /*Nonzero if an error occurred.*/
+ int error;
+#if OD_MEASURE_EC_OVERHEAD
+ double entropy;
+ int nb_symbols;
+#endif
+};
+
+/*See entenc.c for further documentation.*/
+
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
+void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
+void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);
+
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15)
+ OD_ARG_NONNULL(1);
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
+
+void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
+ uint32_t *nbytes)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+
+void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
+void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
new file mode 100644
index 000000000..09d945afc
--- /dev/null
+++ b/third_party/aom/aom_dsp/fastssim.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by: Nathan E. Egge, at the Daala
+ * project.
+ */
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/system_state.h"
+
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+#if CONFIG_HIGHBITDEPTH
+#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
+#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
+#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
+#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+#endif
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+struct fs_level {
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ int w;
+ int h;
+};
+
+struct fs_ctx {
+ fs_level *level;
+ int nlevels;
+ unsigned *col_buf;
+};
+
+static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+ unsigned char *data;
+ size_t data_size;
+ int lw;
+ int lh;
+ int l;
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ data_size =
+ _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size += im_size;
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ data_size += level_size;
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ data = (unsigned char *)malloc(data_size);
+ _ctx->level = (fs_level *)data;
+ _ctx->nlevels = _nlevels;
+ data += _nlevels * sizeof(*_ctx->level);
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ _ctx->level[l].w = lw;
+ _ctx->level[l].h = lh;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ _ctx->level[l].im1 = (uint32_t *)data;
+ _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+ data += level_size;
+ _ctx->level[l].ssim = (double *)data;
+ data += im_size * sizeof(*_ctx->level[l].ssim);
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ _ctx->col_buf = (unsigned *)data;
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
+
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+ const uint32_t *src1;
+ const uint32_t *src2;
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w2;
+ int h2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ dst1 = _ctx->level[_l].im1;
+ dst2 = _ctx->level[_l].im2;
+ w2 = _ctx->level[_l - 1].w;
+ h2 = _ctx->level[_l - 1].h;
+ src1 = _ctx->level[_l - 1].im1;
+ src2 = _ctx->level[_l - 1].im2;
+ for (j = 0; j < h; j++) {
+ int j0offs;
+ int j1offs;
+ j0offs = 2 * j * w2;
+ j1offs = FS_MINI(2 * j + 1, h2) * w2;
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, w2);
+ dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
+ src1[j1offs + i0] + src1[j1offs + i1];
+ dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
+ src2[j1offs + i0] + src2[j1offs + i1];
+ }
+ }
+}
+
+static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
+ int _s1ystride, const uint8_t *_src2,
+ int _s2ystride, int _w, int _h, uint32_t bd,
+ uint32_t shift) {
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[0].w;
+ h = _ctx->level[0].h;
+ dst1 = _ctx->level[0].im1;
+ dst2 = _ctx->level[0].im2;
+ for (j = 0; j < h; j++) {
+ int j0;
+ int j1;
+ j0 = 2 * j;
+ j1 = FS_MINI(j0 + 1, _h);
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, _w);
+ if (bd == 8 && shift == 0) {
+ dst1[j * w + i] =
+ _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
+ _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] =
+ _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
+ _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
+ } else {
+ uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+ uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+ dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+ (src1s[j0 * _s1ystride + i1] >> shift) +
+ (src1s[j1 * _s1ystride + i0] >> shift) +
+ (src1s[j1 * _s1ystride + i1] >> shift);
+ dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+ (src2s[j0 * _s2ystride + i1] >> shift) +
+ (src2s[j1 * _s2ystride + i0] >> shift) +
+ (src2s[j1 * _s2ystride + i1] >> shift);
+ }
+ }
+ }
+}
+
+static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
+ unsigned *col_sums_x;
+ unsigned *col_sums_y;
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ double c1;
+ int w;
+ int h;
+ int j0offs;
+ int j1offs;
+ int i;
+ int j;
+ double ssim_c1 = SSIM_C1;
+#if CONFIG_HIGHBITDEPTH
+ if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
+ if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
+#else
+ assert(bit_depth == 8);
+ (void)bit_depth;
+#endif
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ col_sums_x = _ctx->col_buf;
+ col_sums_y = col_sums_x + w;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
+ for (j = 1; j < 4; j++) {
+ j1offs = FS_MINI(j, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ ssim = _ctx->level[_l].ssim;
+ c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
+ for (j = 0; j < h; j++) {
+ unsigned mux;
+ unsigned muy;
+ int i0;
+ int i1;
+ mux = 5 * col_sums_x[0];
+ muy = 5 * col_sums_y[0];
+ for (i = 1; i < 4; i++) {
+ i1 = FS_MINI(i, w - 1);
+ mux += col_sums_x[i1];
+ muy += col_sums_y[i1];
+ }
+ for (i = 0; i < w; i++) {
+ ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
+ (mux * (double)mux + muy * (double)muy + c1);
+ if (i + 1 < w) {
+ i0 = FS_MAXI(0, i - 4);
+ i1 = FS_MINI(i + 4, w - 1);
+ mux += col_sums_x[i1] - col_sums_x[i0];
+ muy += col_sums_y[i1] - col_sums_y[i0];
+ }
+ }
+ if (j + 1 < h) {
+ j0offs = FS_MAXI(0, j - 4) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
+ j1offs = FS_MINI(j + 4, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ }
+}
+
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+ } while (0)
+
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+ } while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+ } while (0)
+
+static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
+ uint32_t *im1;
+ uint32_t *im2;
+ unsigned *gx_buf;
+ unsigned *gy_buf;
+ double *ssim;
+ double col_sums_gx2[8];
+ double col_sums_gy2[8];
+ double col_sums_gxgy[8];
+ double c2;
+ int stride;
+ int w;
+ int h;
+ int i;
+ int j;
+ double ssim_c2 = SSIM_C2;
+#if CONFIG_HIGHBITDEPTH
+ if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
+ if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
+#else
+ assert(bit_depth == 8);
+ (void)bit_depth;
+#endif
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ ssim = _ctx->level[_l].ssim;
+ gx_buf = _ctx->col_buf;
+ stride = w + 8;
+ gy_buf = gx_buf + 8 * stride;
+ memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+ c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
+ for (j = 0; j < h + 4; j++) {
+ if (j < h - 1) {
+ for (i = 0; i < w - 1; i++) {
+ unsigned g1;
+ unsigned g2;
+ unsigned gx;
+ unsigned gy;
+ g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
+ g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
+ gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
+ g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
+ gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ gx_buf[(j & 7) * stride + i + 4] = gx;
+ gy_buf[(j & 7) * stride + i + 4] = gy;
+ }
+ } else {
+ memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+ memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+ }
+ if (j >= 4) {
+ int k;
+ col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+ col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+ col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+ col_sums_gxgy[0] = 0;
+ for (i = 4; i < 8; i++) {
+ FS_COL_SET(i, -1, 0);
+ FS_COL_ADD(i, 0, 0);
+ for (k = 1; k < 8 - i; k++) {
+ FS_COL_DOUBLE(i, i);
+ FS_COL_ADD(i, -k - 1, 0);
+ FS_COL_ADD(i, k, 0);
+ }
+ }
+ for (i = 0; i < w; i++) {
+ double mugx2;
+ double mugy2;
+ double mugxgy;
+ mugx2 = col_sums_gx2[0];
+ for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
+ mugy2 = col_sums_gy2[0];
+ for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
+ mugxgy = col_sums_gxgy[0];
+ for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
+ ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+ if (i + 1 < w) {
+ FS_COL_SET(0, -1, 1);
+ FS_COL_ADD(0, 0, 1);
+ FS_COL_SUB(2, -3, 2);
+ FS_COL_SUB(2, 2, 2);
+ FS_COL_HALVE(1, 2);
+ FS_COL_SUB(3, -4, 3);
+ FS_COL_SUB(3, 3, 3);
+ FS_COL_HALVE(2, 3);
+ FS_COL_COPY(3, 4);
+ FS_COL_DOUBLE(4, 5);
+ FS_COL_ADD(4, -4, 5);
+ FS_COL_ADD(4, 3, 5);
+ FS_COL_DOUBLE(5, 6);
+ FS_COL_ADD(5, -3, 6);
+ FS_COL_ADD(5, 2, 6);
+ FS_COL_DOUBLE(6, 7);
+ FS_COL_ADD(6, -2, 7);
+ FS_COL_ADD(6, 1, 7);
+ FS_COL_SET(7, -1, 8);
+ FS_COL_ADD(7, 0, 8);
+ }
+ }
+ }
+ }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+ Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {
+ 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
+};
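A hedged, illustrative-only check of the derivation described in the comment above: drop the finest-scale weight and renormalize the remaining four. The helper is not part of the imported sources.

/* Illustrative only: each FS_WEIGHTS entry equals the corresponding Wang
   weight divided by the sum of the four coarser weights. */
static void check_fs_weights(void) {
  static const double wang[5] = { 0.0448, 0.2856, 0.3001, 0.2363, 0.1333 };
  const double sum = wang[1] + wang[2] + wang[3] + wang[4];
  int l;
  for (l = 0; l < FS_NLEVELS; l++)
    assert(fabs(FS_WEIGHTS[l] - wang[l + 1] / sum) < 1e-4);
}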
+
+static double fs_average(fs_ctx *_ctx, int _l) {
+ double *ssim;
+ double ret;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ ssim = _ctx->level[_l].ssim;
+ ret = 0;
+ for (j = 0; j < h; j++)
+ for (i = 0; i < w; i++) ret += ssim[j * w + i];
+ return pow(ret / (w * h), FS_WEIGHTS[_l]);
+}
+
+static double convert_ssim_db(double _ssim, double _weight) {
+ assert(_weight >= _ssim);
+ if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
+ return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
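As a worked example of the mapping above (numbers chosen for illustration only): with _weight = 1.0 and _ssim = 0.99 the function returns 10 * (log10(1.0) - log10(0.01)) = 20 dB, while values within 1e-10 of the weight saturate at MAX_SSIM_DB.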
+
+static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
+ int _dystride, int _w, int _h, uint32_t _bd,
+ uint32_t _shift) {
+ fs_ctx ctx;
+ double ret;
+ int l;
+ ret = 1;
+ fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
+ _shift);
+ for (l = 0; l < FS_NLEVELS - 1; l++) {
+ fs_calc_structure(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_downsample_level(&ctx, l + 1);
+ }
+ fs_calc_structure(&ctx, l, _bd);
+ fs_apply_luminance(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_ctx_clear(&ctx);
+ return ret;
+}
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd) {
+ double ssimv;
+ uint32_t bd_shift = 0;
+ aom_clear_system_state();
+ assert(bd >= in_bd);
+
+ bd_shift = bd - in_bd;
+
+ *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, bd_shift);
+ *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift);
+ *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift);
+ ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+ return convert_ssim_db(ssimv, 1.0);
+}
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
new file mode 100644
index 000000000..12ee02ba1
--- /dev/null
+++ b/third_party/aom/aom_dsp/fwd_txfm.c
@@ -0,0 +1,809 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/fwd_txfm.h"
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+
+void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. Because the first
+ // pass leaves its results transposed, the second pass again operates on
+ // columns (which are the transposed rows) and transposes its results, so
+ // the output ends up back in normal row order.
+ int pass;
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[4 * 4];
+ const tran_low_t *in_low = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t in_high[4]; // canbe16
+ tran_high_t step[4]; // canbe16
+ tran_high_t temp1, temp2; // needs32
+ int i;
+ for (i = 0; i < 4; ++i) {
+ // Load inputs.
+ if (pass == 0) {
+ in_high[0] = input[0 * stride] * 16;
+ in_high[1] = input[1 * stride] * 16;
+ in_high[2] = input[2 * stride] * 16;
+ in_high[3] = input[3 * stride] * 16;
+ if (i == 0 && in_high[0]) {
+ ++in_high[0];
+ }
+ } else {
+ assert(in_low != NULL);
+ in_high[0] = in_low[0 * 4];
+ in_high[1] = in_low[1 * 4];
+ in_high[2] = in_low[2 * 4];
+ in_high[3] = in_low[3 * 4];
+ ++in_low;
+ }
+ // Transform.
+ step[0] = in_high[0] + in_high[3];
+ step[1] = in_high[1] + in_high[2];
+ step[2] = in_high[1] - in_high[2];
+ step[3] = in_high[0] - in_high[3];
+ temp1 = (step[0] + step[1]) * cospi_16_64;
+ temp2 = (step[0] - step[1]) * cospi_16_64;
+ out[0] = (tran_low_t)fdct_round_shift(temp1);
+ out[2] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[3] = (tran_low_t)fdct_round_shift(temp2);
+ // Do next column (which is a transposed row in second/horizontal pass)
+ ++input;
+ out += 4;
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+
+ {
+ int i, j;
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ }
+ }
+}
+
+void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 4; ++r)
+ for (c = 0; c < 4; ++c) sum += input[r * stride + c];
+
+ output[0] = sum << 1;
+}
+
+void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+ int i, j;
+ tran_low_t intermediate[64];
+ int pass;
+ tran_low_t *output = intermediate;
+ const tran_low_t *in = NULL;
+
+ // Transform columns
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ if (pass == 0) {
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+ ++input;
+ } else {
+ s0 = in[0 * 8] + in[7 * 8];
+ s1 = in[1 * 8] + in[6 * 8];
+ s2 = in[2 * 8] + in[5 * 8];
+ s3 = in[3 * 8] + in[4 * 8];
+ s4 = in[3 * 8] - in[4 * 8];
+ s5 = in[2 * 8] - in[5 * 8];
+ s6 = in[1 * 8] - in[6 * 8];
+ s7 = in[0 * 8] - in[7 * 8];
+ ++in;
+ }
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
+ output += 8;
+ }
+ in = intermediate;
+ output = final_output;
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+ }
+}
+
+void aom_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 8; ++r)
+ for (c = 0; c < 8; ++c) sum += input[r * stride + c];
+
+ output[0] = sum;
+}
+
+void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. Because the first
+ // pass leaves its results transposed, the second pass again operates on
+ // columns (which are the transposed rows) and transposes its results, so
+ // the output ends up back in normal row order.
+ int pass;
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[256];
+ const tran_low_t *in_low = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t in_high[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
+ int i;
+ for (i = 0; i < 16; i++) {
+ if (0 == pass) {
+ // Calculate input for the first 8 results.
+ in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
+ in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
+ in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
+ in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
+ in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
+ in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
+ in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
+ in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
+ // Calculate input for the next 8 results.
+ step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
+ step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
+ step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
+ step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
+ step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
+ step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
+ step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
+ step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
+ } else {
+ // Calculate input for the first 8 results.
+ assert(in_low != NULL);
+ in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
+ in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
+ in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
+ in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
+ in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
+ in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
+ in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
+ in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
+ // Calculate input for the next 8 results.
+ step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
+ step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
+ step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
+ step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
+ step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
+ step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
+ step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
+ step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
+ in_low++;
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ // stage 1
+ s0 = in_high[0] + in_high[7];
+ s1 = in_high[1] + in_high[6];
+ s2 = in_high[2] + in_high[5];
+ s3 = in_high[3] + in_high[4];
+ s4 = in_high[3] - in_high[4];
+ s5 = in_high[2] - in_high[5];
+ s6 = in_high[1] - in_high[6];
+ s7 = in_high[0] - in_high[7];
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = fdct_round_shift(temp1);
+ step2[3] = fdct_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = fdct_round_shift(temp1);
+ step2[5] = fdct_round_shift(temp2);
+ // step 3
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
+ // step 4
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
+ step2[1] = fdct_round_shift(temp1);
+ step2[2] = fdct_round_shift(temp2);
+ temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = fdct_round_shift(temp1);
+ step2[6] = fdct_round_shift(temp2);
+ // step 5
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] + step2[2];
+ step1[3] = step3[3] - step2[2];
+ step1[4] = step3[4] - step2[5];
+ step1[5] = step3[4] + step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
+ // step 6
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
+ }
+ // Do next column (which is a transposed row in second/horizontal pass)
+ input++;
+ out += 16;
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+}
+
+void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ int sum = 0;
+ for (r = 0; r < 16; ++r)
+ for (c = 0; c < 16; ++c) sum += input[r * stride + c];
+
+ output[0] = (tran_low_t)(sum >> 1);
+}
+
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+ // and make the bounds consts.
+ // assert(-131072 <= rv && rv <= 131071);
+ return rv;
+}
+
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+ // Divides by 4, rounding to nearest with ties broken toward zero; the
+ // (input < 0) term keeps the rounding symmetric for negative inputs.
+ tran_high_t rv = (input + 1 + (input < 0)) >> 2;
+ return rv;
+}
+
+void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+ tran_high_t step[32];
+ // Stage 1
+ step[0] = input[0] + input[(32 - 1)];
+ step[1] = input[1] + input[(32 - 2)];
+ step[2] = input[2] + input[(32 - 3)];
+ step[3] = input[3] + input[(32 - 4)];
+ step[4] = input[4] + input[(32 - 5)];
+ step[5] = input[5] + input[(32 - 6)];
+ step[6] = input[6] + input[(32 - 7)];
+ step[7] = input[7] + input[(32 - 8)];
+ step[8] = input[8] + input[(32 - 9)];
+ step[9] = input[9] + input[(32 - 10)];
+ step[10] = input[10] + input[(32 - 11)];
+ step[11] = input[11] + input[(32 - 12)];
+ step[12] = input[12] + input[(32 - 13)];
+ step[13] = input[13] + input[(32 - 14)];
+ step[14] = input[14] + input[(32 - 15)];
+ step[15] = input[15] + input[(32 - 16)];
+ step[16] = -input[16] + input[(32 - 17)];
+ step[17] = -input[17] + input[(32 - 18)];
+ step[18] = -input[18] + input[(32 - 19)];
+ step[19] = -input[19] + input[(32 - 20)];
+ step[20] = -input[20] + input[(32 - 21)];
+ step[21] = -input[21] + input[(32 - 22)];
+ step[22] = -input[22] + input[(32 - 23)];
+ step[23] = -input[23] + input[(32 - 24)];
+ step[24] = -input[24] + input[(32 - 25)];
+ step[25] = -input[25] + input[(32 - 26)];
+ step[26] = -input[26] + input[(32 - 27)];
+ step[27] = -input[27] + input[(32 - 28)];
+ step[28] = -input[28] + input[(32 - 29)];
+ step[29] = -input[29] + input[(32 - 30)];
+ step[30] = -input[30] + input[(32 - 31)];
+ step[31] = -input[31] + input[(32 - 32)];
+
+ // Stage 2
+ output[0] = step[0] + step[16 - 1];
+ output[1] = step[1] + step[16 - 2];
+ output[2] = step[2] + step[16 - 3];
+ output[3] = step[3] + step[16 - 4];
+ output[4] = step[4] + step[16 - 5];
+ output[5] = step[5] + step[16 - 6];
+ output[6] = step[6] + step[16 - 7];
+ output[7] = step[7] + step[16 - 8];
+ output[8] = -step[8] + step[16 - 9];
+ output[9] = -step[9] + step[16 - 10];
+ output[10] = -step[10] + step[16 - 11];
+ output[11] = -step[11] + step[16 - 12];
+ output[12] = -step[12] + step[16 - 13];
+ output[13] = -step[13] + step[16 - 14];
+ output[14] = -step[14] + step[16 - 15];
+ output[15] = -step[15] + step[16 - 16];
+
+ output[16] = step[16];
+ output[17] = step[17];
+ output[18] = step[18];
+ output[19] = step[19];
+
+ output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+ output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+ output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+ output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+ output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+ output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+ output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+ output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+ output[28] = step[28];
+ output[29] = step[29];
+ output[30] = step[30];
+ output[31] = step[31];
+
+ // Damp the magnitude by a factor of 4 so that the intermediate values stay
+ // within the range of 16 bits.
+ if (round) {
+ output[0] = half_round_shift(output[0]);
+ output[1] = half_round_shift(output[1]);
+ output[2] = half_round_shift(output[2]);
+ output[3] = half_round_shift(output[3]);
+ output[4] = half_round_shift(output[4]);
+ output[5] = half_round_shift(output[5]);
+ output[6] = half_round_shift(output[6]);
+ output[7] = half_round_shift(output[7]);
+ output[8] = half_round_shift(output[8]);
+ output[9] = half_round_shift(output[9]);
+ output[10] = half_round_shift(output[10]);
+ output[11] = half_round_shift(output[11]);
+ output[12] = half_round_shift(output[12]);
+ output[13] = half_round_shift(output[13]);
+ output[14] = half_round_shift(output[14]);
+ output[15] = half_round_shift(output[15]);
+
+ output[16] = half_round_shift(output[16]);
+ output[17] = half_round_shift(output[17]);
+ output[18] = half_round_shift(output[18]);
+ output[19] = half_round_shift(output[19]);
+ output[20] = half_round_shift(output[20]);
+ output[21] = half_round_shift(output[21]);
+ output[22] = half_round_shift(output[22]);
+ output[23] = half_round_shift(output[23]);
+ output[24] = half_round_shift(output[24]);
+ output[25] = half_round_shift(output[25]);
+ output[26] = half_round_shift(output[26]);
+ output[27] = half_round_shift(output[27]);
+ output[28] = half_round_shift(output[28]);
+ output[29] = half_round_shift(output[29]);
+ output[30] = half_round_shift(output[30]);
+ output[31] = half_round_shift(output[31]);
+ }
+
+ // Stage 3
+ step[0] = output[0] + output[(8 - 1)];
+ step[1] = output[1] + output[(8 - 2)];
+ step[2] = output[2] + output[(8 - 3)];
+ step[3] = output[3] + output[(8 - 4)];
+ step[4] = -output[4] + output[(8 - 5)];
+ step[5] = -output[5] + output[(8 - 6)];
+ step[6] = -output[6] + output[(8 - 7)];
+ step[7] = -output[7] + output[(8 - 8)];
+ step[8] = output[8];
+ step[9] = output[9];
+ step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+ step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+ step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+ step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+ step[14] = output[14];
+ step[15] = output[15];
+
+ step[16] = output[16] + output[23];
+ step[17] = output[17] + output[22];
+ step[18] = output[18] + output[21];
+ step[19] = output[19] + output[20];
+ step[20] = -output[20] + output[19];
+ step[21] = -output[21] + output[18];
+ step[22] = -output[22] + output[17];
+ step[23] = -output[23] + output[16];
+ step[24] = -output[24] + output[31];
+ step[25] = -output[25] + output[30];
+ step[26] = -output[26] + output[29];
+ step[27] = -output[27] + output[28];
+ step[28] = output[28] + output[27];
+ step[29] = output[29] + output[26];
+ step[30] = output[30] + output[25];
+ step[31] = output[31] + output[24];
+
+ // Stage 4
+ output[0] = step[0] + step[3];
+ output[1] = step[1] + step[2];
+ output[2] = -step[2] + step[1];
+ output[3] = -step[3] + step[0];
+ output[4] = step[4];
+ output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+ output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+ output[7] = step[7];
+ output[8] = step[8] + step[11];
+ output[9] = step[9] + step[10];
+ output[10] = -step[10] + step[9];
+ output[11] = -step[11] + step[8];
+ output[12] = -step[12] + step[15];
+ output[13] = -step[13] + step[14];
+ output[14] = step[14] + step[13];
+ output[15] = step[15] + step[12];
+
+ output[16] = step[16];
+ output[17] = step[17];
+ output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+ output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+ output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+ output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+ output[22] = step[22];
+ output[23] = step[23];
+ output[24] = step[24];
+ output[25] = step[25];
+ output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+ output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+ output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+ output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+ output[30] = step[30];
+ output[31] = step[31];
+
+ // Stage 5
+ step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+ step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+ step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+ step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+ step[4] = output[4] + output[5];
+ step[5] = -output[5] + output[4];
+ step[6] = -output[6] + output[7];
+ step[7] = output[7] + output[6];
+ step[8] = output[8];
+ step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+ step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+ step[11] = output[11];
+ step[12] = output[12];
+ step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+ step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+ step[15] = output[15];
+
+ step[16] = output[16] + output[19];
+ step[17] = output[17] + output[18];
+ step[18] = -output[18] + output[17];
+ step[19] = -output[19] + output[16];
+ step[20] = -output[20] + output[23];
+ step[21] = -output[21] + output[22];
+ step[22] = output[22] + output[21];
+ step[23] = output[23] + output[20];
+ step[24] = output[24] + output[27];
+ step[25] = output[25] + output[26];
+ step[26] = -output[26] + output[25];
+ step[27] = -output[27] + output[24];
+ step[28] = -output[28] + output[31];
+ step[29] = -output[29] + output[30];
+ step[30] = output[30] + output[29];
+ step[31] = output[31] + output[28];
+
+ // Stage 6
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+ output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+ output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+ output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+ output[8] = step[8] + step[9];
+ output[9] = -step[9] + step[8];
+ output[10] = -step[10] + step[11];
+ output[11] = step[11] + step[10];
+ output[12] = step[12] + step[13];
+ output[13] = -step[13] + step[12];
+ output[14] = -step[14] + step[15];
+ output[15] = step[15] + step[14];
+
+ output[16] = step[16];
+ output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+ output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+ output[19] = step[19];
+ output[20] = step[20];
+ output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+ output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+ output[23] = step[23];
+ output[24] = step[24];
+ output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+ output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+ output[27] = step[27];
+ output[28] = step[28];
+ output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+ output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+ output[31] = step[31];
+
+ // Stage 7
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+ step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+ step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+ step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+ step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+ step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+ step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+ step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+ step[16] = output[16] + output[17];
+ step[17] = -output[17] + output[16];
+ step[18] = -output[18] + output[19];
+ step[19] = output[19] + output[18];
+ step[20] = output[20] + output[21];
+ step[21] = -output[21] + output[20];
+ step[22] = -output[22] + output[23];
+ step[23] = output[23] + output[22];
+ step[24] = output[24] + output[25];
+ step[25] = -output[25] + output[24];
+ step[26] = -output[26] + output[27];
+ step[27] = output[27] + output[26];
+ step[28] = output[28] + output[29];
+ step[29] = -output[29] + output[28];
+ step[30] = -output[30] + output[31];
+ step[31] = output[31] + output[30];
+
+  // Final stage: output indices are bit-reversed.
+ output[0] = step[0];
+ output[16] = step[1];
+ output[8] = step[2];
+ output[24] = step[3];
+ output[4] = step[4];
+ output[20] = step[5];
+ output[12] = step[6];
+ output[28] = step[7];
+ output[2] = step[8];
+ output[18] = step[9];
+ output[10] = step[10];
+ output[26] = step[11];
+ output[6] = step[12];
+ output[22] = step[13];
+ output[14] = step[14];
+ output[30] = step[15];
+
+ output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+ output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+ output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+ output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+ output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+ output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+ output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+ output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+ output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+ output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+ output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+ output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+ output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+ output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+ output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+ output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+}
+
+void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+ int i, j;
+ tran_high_t output[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ aom_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+ aom_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
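+
+// Worked example of the two rounding expressions above, with the purely
+// illustrative value temp_out[j] = +/-6:
+//   column pass: (6 + 1 + 1) >> 2 = 2,   (-6 + 1 + 0) >> 2 = -5 >> 2 = -2
+//   row pass:    (6 + 1 + 0) >> 2 = 1,   (-6 + 1 + 1) >> 2 = -4 >> 2 = -1
+// i.e. the column pass breaks x/4 ties away from zero while the row pass
+// breaks them toward zero (assuming the usual arithmetic right shift for
+// negative values).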
+
+// Note that although dct_32_round is used in the dct32 computation flow,
+// this 2D fdct32x32 for the rate-distortion optimization loop operates
+// within 16-bit precision.
+void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+ int i, j;
+ tran_high_t output[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ aom_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ // TODO(cd): see quality impact of only doing
+ // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+ // PS: also change code in aom_dsp/x86/aom_dct_sse2.c
+ output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+ aom_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+ }
+}
+
+void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ int sum = 0;
+ for (r = 0; r < 32; ++r)
+ for (c = 0; c < 32; ++c) sum += input[r * stride + c];
+
+ output[0] = (tran_low_t)(sum >> 3);
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ aom_fdct4x4_c(input, output, stride);
+}
+
+void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ aom_fdct8x8_c(input, final_output, stride);
+}
+
+void aom_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ aom_fdct8x8_1_c(input, final_output, stride);
+}
+
+void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ aom_fdct16x16_c(input, output, stride);
+}
+
+void aom_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ aom_fdct16x16_1_c(input, output, stride);
+}
+
+void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+ aom_fdct32x32_c(input, out, stride);
+}
+
+void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+ int stride) {
+ aom_fdct32x32_rd_c(input, out, stride);
+}
+
+void aom_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+ int stride) {
+ aom_fdct32x32_1_c(input, out, stride);
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/fwd_txfm.h b/third_party/aom/aom_dsp/fwd_txfm.h
new file mode 100644
index 000000000..579dbd06e
--- /dev/null
+++ b/third_party/aom/aom_dsp/fwd_txfm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_FWD_TXFM_H_
+#define AOM_DSP_FWD_TXFM_H_
+
+#include "aom_dsp/txfm_common.h"
+
+static INLINE tran_high_t saturate_int16(tran_high_t value) {
+ tran_high_t result;
+ result = value > INT16_MAX ? INT16_MAX : value;
+ return result < INT16_MIN ? INT16_MIN : result;
+}
+
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return rv;
+}
+
+void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+#endif // AOM_DSP_FWD_TXFM_H_
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
new file mode 100644
index 000000000..1f0870b64
--- /dev/null
+++ b/third_party/aom/aom_dsp/intrapred.c
@@ -0,0 +1,971 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+
+#define DST(x, y) dst[(x) + (y)*stride]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
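+
+// A quick worked example of the averaging macros (pixel values are purely
+// illustrative): AVG3(10, 20, 30) = (10 + 2 * 20 + 30 + 2) >> 2 = 82 >> 2 = 20
+// and AVG2(10, 21) = (10 + 21 + 1) >> 1 = 16, i.e. rounded averages, with AVG3
+// giving the middle sample twice the weight.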
+
+static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ (void)above;
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
+ left[(c >> 1) + r + 2])
+ : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ (void)left;
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
+ above[(r >> 1) + c + 2])
+ : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ (void)left;
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ dst[c] = AVG3(above[r + c], above[r + c + 1],
+ above[r + c + 1 + (r + c + 2 < bs * 2)]);
+ }
+ dst += stride;
+ }
+}
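+
+// Note: the (r + c + 2 < bs * 2) term above only changes the bottom-right
+// pixel (r = c = bs - 1), where the third tap would otherwise be
+// above[2 * bs]; that pixel instead repeats the last top sample:
+// AVG3(above[2 * bs - 2], above[2 * bs - 1], above[2 * bs - 1]).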
+
+static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+
+ // first row
+ for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
+ dst += stride;
+
+ // second row
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ dst += stride;
+
+ // the rest of first col
+ dst[0] = AVG3(above[-1], left[0], left[1]);
+ for (r = 3; r < bs; ++r)
+ dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+
+ // the rest of the block
+ for (r = 2; r < bs; ++r) {
+ for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
+ dst += stride;
+ }
+}
+
+static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+#if CONFIG_TX64X64
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+ // silence a spurious -Warray-bounds warning, possibly related to:
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+ uint8_t border[133];
+#else
+ uint8_t border[64 + 64 - 1]; // outer border from bottom-left to top-right
+#endif
+#else
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+ // silence a spurious -Warray-bounds warning, possibly related to:
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+ uint8_t border[69];
+#else
+ uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right
+#endif
+#endif // CONFIG_TX64X64
+
+ // dst(bs, bs - 2)[0], i.e., border starting at bottom-left
+ for (i = 0; i < bs - 2; ++i) {
+ border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
+ }
+ border[bs - 2] = AVG3(above[-1], left[0], left[1]);
+ border[bs - 1] = AVG3(left[0], above[-1], above[0]);
+ border[bs - 0] = AVG3(above[-1], above[0], above[1]);
+ // dst[0][2, size), i.e., remaining top border ascending
+ for (i = 0; i < bs - 2; ++i) {
+ border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
+ }
+
+ for (i = 0; i < bs; ++i) {
+ memcpy(dst + i * stride, border + bs - 1 - i, bs);
+ }
+}
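+
+// Layout sketch for the (illustrative) case bs = 4: border[] holds
+// 2 * 4 - 1 = 7 filtered samples, with border[0..1] taken from the left
+// column (bottom up), border[2..4] built around the top-left corner and
+// border[5..6] taken from the top row. Row i of the block is then simply
+// border[3 - i .. 6 - i], so every down-right diagonal shares one border
+// sample.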
+
+static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ dst[0] = AVG2(above[-1], left[0]);
+ for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
+ dst++;
+
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ dst[stride] = AVG3(above[-1], left[0], left[1]);
+ for (r = 2; r < bs; r++)
+ dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+ dst++;
+
+ for (c = 0; c < bs - 2; c++)
+ dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
+ dst += stride;
+
+ for (r = 1; r < bs; ++r) {
+ for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
+ dst += stride;
+ }
+}
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)left;
+
+ for (r = 0; r < bs; r++) {
+ memcpy(dst, above, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)above;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, left[r], bs);
+ dst += stride;
+ }
+}
+
+#if CONFIG_ALT_INTRA
+static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
+
+static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
+ uint16_t top_left) {
+ const int base = top + left - top_left;
+ const int p_left = abs_diff(base, left);
+ const int p_top = abs_diff(base, top);
+ const int p_top_left = abs_diff(base, top_left);
+
+  // Return whichever of left, top and top_left is nearest to base.
+ return (p_left <= p_top && p_left <= p_top_left)
+ ? left
+ : (p_top <= p_top_left) ? top : top_left;
+}
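+
+// Worked example with the illustrative values left = 100, top = 120 and
+// top_left = 90: base = 100 + 120 - 90 = 130, so p_left = 30, p_top = 10 and
+// p_top_left = 40, and the predictor returns top (120), the neighbor closest
+// to the gradient-extrapolated base.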
+
+static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ const uint8_t ytop_left = above[-1];
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+// Weights are quadratic from '1' to '1 / block_size', scaled by
+// 2^sm_weight_log2_scale.
+static const int sm_weight_log2_scale = 8;
+
+#if CONFIG_TX64X64
+// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
+#define MAX_BLOCK_DIM 64
+#define NUM_BLOCK_DIMS 6 // log2(MAX_BLOCK_DIM)
+#else
+#define MAX_BLOCK_DIM 32
+#define NUM_BLOCK_DIMS 5
+#endif // CONFIG_TX64X64
+
+static const uint8_t sm_weight_arrays[NUM_BLOCK_DIMS][MAX_BLOCK_DIM] = {
+ // bs = 2
+ { 255, 128 },
+ // bs = 4
+ { 255, 149, 85, 64 },
+ // bs = 8
+ { 255, 197, 146, 105, 73, 50, 37, 32 },
+ // bs = 16
+ { 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16 },
+ // bs = 32
+ {
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122,
+ 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34,
+ 29, 25, 21, 17, 14, 12, 10, 9, 8, 8 },
+#if CONFIG_TX64X64
+ // bs = 64
+ { 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169,
+ 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96,
+ 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44,
+ 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13,
+ 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 },
+#endif // CONFIG_TX64X64
+};
+
+// Some basic checks on weights for smooth predictor.
+#define sm_weights_sanity_checks(weights, weights_scale, pred_scale) \
+ assert(weights[0] < weights_scale); \
+ assert(weights_scale - weights[bs - 1] < weights_scale); \
+ assert(pred_scale < 31) // ensures no overflow when calculating predictor.
+
+#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
+
+static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel
+ const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel
+ const int arr_index = get_msb(bs) - 1;
+ assert(arr_index >= 0);
+ assert(arr_index < NUM_BLOCK_DIMS);
+ const uint8_t *const sm_weights = sm_weight_arrays[arr_index];
+ // scale = 2 * 2^sm_weight_log2_scale
+ const int log2_scale = 1 + sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bs; ++r) {
+ int c;
+ for (c = 0; c < bs; ++c) {
+ const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r],
+ sm_weights[c], scale - sm_weights[c] };
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights[r] && scale >= sm_weights[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = clip_pixel(divide_round(this_pred, log2_scale));
+ }
+ dst += stride;
+ }
+}
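+
+// Worked example (all pixel values are illustrative) for bs = 4, where
+// sm_weights = sm_weight_arrays[1] = { 255, 149, 85, 64 }, scale = 256 and
+// log2_scale = 9. At r = 0, c = 0 with above[0] = 100, left[0] = 80,
+// below_pred = 60 and right_pred = 120:
+//   this_pred = 255 * 100 + 1 * 60 + 255 * 80 + 1 * 120 = 46080
+//   dst[0]    = divide_round(46080, 9) = (46080 + 256) >> 9 = 90
+// so the corner pixel is dominated by its nearest above/left neighbors, as
+// intended by the quadratic weights.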
+
+#else
+
+static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ int ytop_left = above[-1];
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
+ dst += stride;
+ }
+}
+#endif // CONFIG_ALT_INTRA
+
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, 128, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+
+ for (i = 0; i < bs; i++) sum += left[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+
+ for (i = 0; i < bs; i++) sum += above[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ const int count = 2 * bs;
+
+ for (i = 0; i < bs; i++) {
+ sum += above[i];
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
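+
+// E.g. for bs = 4 with the illustrative borders above = { 10, 12, 14, 16 } and
+// left = { 9, 11, 13, 15 }: sum = 100, count = 8 and every pixel of the block
+// is set to expected_dc = (100 + 4) / 8 = 13, a rounded mean of both borders.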
+
+void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ (void)stride;
+ (void)left;
+
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(1, 1) = AVG3(C, D, D);
+}
+
+void aom_d117_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ DST(0, 0) = AVG2(X, A);
+ DST(1, 0) = AVG2(A, B);
+ DST(0, 1) = AVG3(I, X, A);
+ DST(1, 1) = AVG3(X, A, B);
+}
+
+void aom_d135_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ (void)stride;
+ DST(0, 1) = AVG3(X, I, J);
+ DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+ DST(1, 0) = AVG3(B, A, X);
+}
+
+void aom_d153_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int X = above[-1];
+ const int A = above[0];
+
+ DST(0, 0) = AVG2(I, X);
+ DST(0, 1) = AVG2(J, I);
+ DST(1, 0) = AVG3(I, X, A);
+ DST(1, 1) = AVG3(J, I, X);
+}
+
+void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ const int E = above[4];
+ const int F = above[5];
+ const int G = above[6];
+ const int H = above[7];
+ (void)stride;
+ (void)left;
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(G, H, H);
+}
+
+void aom_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+void aom_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ (void)stride;
+ DST(0, 3) = AVG3(J, K, L);
+ DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
+ DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
+ DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+ DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+ DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
+}
+
+void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)above;
+ (void)bd;
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
+ left[(c >> 1) + r + 2])
+ : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
+ above[(r >> 1) + c + 2])
+ : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ dst[c] = AVG3(above[r + c], above[r + c + 1],
+ above[r + c + 1 + (r + c + 2 < bs * 2)]);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)bd;
+
+ // first row
+ for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
+ dst += stride;
+
+ // second row
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ dst += stride;
+
+ // the rest of first col
+ dst[0] = AVG3(above[-1], left[0], left[1]);
+ for (r = 3; r < bs; ++r)
+ dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+
+ // the rest of the block
+ for (r = 2; r < bs; ++r) {
+ for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)bd;
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+
+ dst[stride] = AVG3(above[-1], left[0], left[1]);
+ for (r = 2; r < bs; ++r)
+ dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+
+ dst += stride;
+ for (r = 1; r < bs; ++r) {
+ for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1];
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)bd;
+ dst[0] = AVG2(above[-1], left[0]);
+ for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
+ dst++;
+
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ dst[stride] = AVG3(above[-1], left[0], left[1]);
+ for (r = 2; r < bs; r++)
+ dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+ dst++;
+
+ for (c = 0; c < bs - 2; c++)
+ dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
+ dst += stride;
+
+ for (r = 1; r < bs; ++r) {
+ for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ memcpy(dst, above, bs * sizeof(uint16_t));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ aom_memset16(dst, left[r], bs);
+ dst += stride;
+ }
+}
+
+void aom_highbd_d207_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ (void)above;
+ (void)bd;
+ DST(0, 0) = AVG2(I, J);
+ DST(0, 1) = AVG2(J, K);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(1, 1) = AVG3(J, K, L);
+}
+
+void aom_highbd_d63_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ (void)left;
+ (void)bd;
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = AVG2(B, C);
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = AVG3(B, C, D);
+}
+
+void aom_highbd_d45e_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ (void)stride;
+ (void)left;
+ (void)bd;
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(1, 1) = AVG3(C, D, D);
+}
+
+void aom_highbd_d117_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ (void)bd;
+ DST(0, 0) = AVG2(X, A);
+ DST(1, 0) = AVG2(A, B);
+ DST(0, 1) = AVG3(I, X, A);
+ DST(1, 1) = AVG3(X, A, B);
+}
+
+void aom_highbd_d135_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ (void)bd;
+ DST(0, 1) = AVG3(X, I, J);
+ DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+ DST(1, 0) = AVG3(B, A, X);
+}
+
+void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int X = above[-1];
+ const int A = above[0];
+ (void)bd;
+ DST(0, 0) = AVG2(I, X);
+ DST(0, 1) = AVG2(J, I);
+ DST(1, 0) = AVG3(I, X, A);
+ DST(1, 1) = AVG3(J, I, X);
+}
+
+#if CONFIG_ALT_INTRA
+static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ const uint16_t ytop_left = above[-1];
+ (void)bd;
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel
+ const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel
+ const int arr_index = get_msb(bs) - 1;
+ assert(arr_index >= 0);
+ assert(arr_index < NUM_BLOCK_DIMS);
+ const uint8_t *const sm_weights = sm_weight_arrays[arr_index];
+ // scale = 2 * 2^sm_weight_log2_scale
+ const int log2_scale = 1 + sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bs; ++r) {
+ int c;
+ for (c = 0; c < bs; ++c) {
+ const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r],
+ sm_weights[c], scale - sm_weights[c] };
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights[r] && scale >= sm_weights[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd);
+ }
+ dst += stride;
+ }
+}
+
+#else
+static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ int ytop_left = above[-1];
+ (void)bd;
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd);
+ dst += stride;
+ }
+}
+#endif // CONFIG_ALT_INTRA
+
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bs; r++) {
+ aom_memset16(dst, 128 << (bd - 8), bs);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < bs; i++) sum += left[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ aom_memset16(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < bs; i++) sum += above[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ aom_memset16(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ const int count = 2 * bs;
+ (void)bd;
+
+ for (i = 0; i < bs; i++) {
+ sum += above[i];
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bs; r++) {
+ aom_memset16(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// These macros generate wrapper functions so that all the prediction functions
+// can be unified and accessed through a pointer array. Note that the above and
+// left boundaries are not necessarily used by every predictor.
+#define intra_pred_sized(type, size) \
+ void aom_##type##_predictor_##size##x##size##_c( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ type##_predictor(dst, stride, size, above, left); \
+ }
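+
+// For illustration, intra_pred_sized(v, 4) expands to:
+//   void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+//                              const uint8_t *above, const uint8_t *left) {
+//     v_predictor(dst, stride, 4, above, left);
+//   }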
+
+#if CONFIG_HIGHBITDEPTH
+#define intra_pred_highbd_sized(type, size) \
+ void aom_highbd_##type##_predictor_##size##x##size##_c( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ highbd_##type##_predictor(dst, stride, size, above, left, bd); \
+ }
+
+/* clang-format off */
+#if CONFIG_TX64X64
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_sized(type, 64) \
+ intra_pred_highbd_sized(type, 4) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32) \
+ intra_pred_highbd_sized(type, 64)
+
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_sized(type, 64) \
+ intra_pred_highbd_sized(type, 4) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32) \
+ intra_pred_highbd_sized(type, 64)
+#else // CONFIG_TX64X64
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_highbd_sized(type, 2) \
+ intra_pred_highbd_sized(type, 4) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32)
+
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_highbd_sized(type, 4) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32)
+#endif // CONFIG_TX64X64
+
+#else
+
+#if CONFIG_TX64X64
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_sized(type, 64)
+
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_sized(type, 64)
+#else // CONFIG_TX64X64
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32)
+
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32)
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+
+intra_pred_allsizes(d207e)
+intra_pred_allsizes(d63e)
+intra_pred_above_4x4(d45e)
+intra_pred_above_4x4(d117)
+intra_pred_above_4x4(d135)
+intra_pred_above_4x4(d153)
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+#if CONFIG_ALT_INTRA
+intra_pred_allsizes(paeth)
+intra_pred_allsizes(smooth)
+#else
+intra_pred_allsizes(tm)
+#endif // CONFIG_ALT_INTRA
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_allsizes(dc)
+/* clang-format on */
+#undef intra_pred_allsizes
diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c
new file mode 100644
index 000000000..bb995856a
--- /dev/null
+++ b/third_party/aom/aom_dsp/inv_txfm.c
@@ -0,0 +1,1445 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/inv_txfm.h"
+
+void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1);
+ op[1] = WRAPLOW(b1);
+ op[2] = WRAPLOW(c1);
+ op[3] = WRAPLOW(d1);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
+
+ ip++;
+ dest++;
+ }
+}
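+
+// The 3.5 adds / 0.5 shifts figure in the comment above can be checked from
+// the lifting steps: each pass performs 7 additions or subtractions and 1
+// shift per group of four samples, so the two passes together cost 14 adds
+// and 2 shifts per 4 pixels.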
+
+void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1);
+ op[1] = op[2] = op[3] = WRAPLOW(e1);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
+ dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
+ dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
+ dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
+ ip++;
+ dest++;
+ }
+}
+
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step[3] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3]);
+ output[1] = WRAPLOW(step[1] + step[2]);
+ output[2] = WRAPLOW(step[1] - step[2]);
+ output[3] = WRAPLOW(step[0] - step[3]);
+}
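+
+// The cospi_N_64 constants used above are defined in txfm_common.h as
+// round(2^14 * cos(N * pi / 64)), e.g. cospi_16_64 = 11585, and
+// dct_const_round_shift() removes that 2^14 scale again:
+//   dct_const_round_shift(x) = (x + (1 << 13)) >> 14  (DCT_CONST_BITS == 14)
+// Worked example with the illustrative DC-only row input[] = { 64, 0, 0, 0 }:
+//   step[0] = step[1] = (64 * 11585 + 8192) >> 14 = 45, step[2] = step[3] = 0
+// so every output sample is 45, roughly 64 * cos(pi / 4).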
+
+void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ aom_idct4_c(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+ aom_idct4_c(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 4));
+ }
+ }
+}
+
+void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ if (a1 == 0) return;
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = clip_pixel_add(dest[0], a1);
+ dest[1] = clip_pixel_add(dest[1], a1);
+ dest[2] = clip_pixel_add(dest[2], a1);
+ dest[3] = clip_pixel_add(dest[3], a1);
+ dest += dest_stride;
+ }
+}
+
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ temp1 = (step1[0] + step1[2]) * cospi_16_64;
+ temp2 = (step1[0] - step1[2]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ // stage 3
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7]);
+ output[1] = WRAPLOW(step1[1] + step1[6]);
+ output[2] = WRAPLOW(step1[2] + step1[5]);
+ output[3] = WRAPLOW(step1[3] + step1[4]);
+ output[4] = WRAPLOW(step1[3] - step1[4]);
+ output[5] = WRAPLOW(step1[2] - step1[5]);
+ output[6] = WRAPLOW(step1[1] - step1[6]);
+ output[7] = WRAPLOW(step1[0] - step1[7]);
+}
+
+void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ for (i = 0; i < 8; ++i) {
+ aom_idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ aom_idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ if (a1 == 0) return;
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = WRAPLOW(x0 - x2 + x3);
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+ output[2] = WRAPLOW(dct_const_round_shift(s2));
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
+}
+
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+ s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+ s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+ s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+
+ // stage 2
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+ s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+ s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+ s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+ // stage 3
+ s2 = (int)(cospi_16_64 * (x2 + x3));
+ s3 = (int)(cospi_16_64 * (x2 - x3));
+ s6 = (int)(cospi_16_64 * (x6 + x7));
+ s7 = (int)(cospi_16_64 * (x6 - x7));
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x4);
+ output[2] = WRAPLOW(x6);
+ output[3] = WRAPLOW(-x2);
+ output[4] = WRAPLOW(x3);
+ output[5] = WRAPLOW(-x7);
+ output[6] = WRAPLOW(x5);
+ output[7] = WRAPLOW(-x1);
+}
+
+void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+  // Only the first 4 rows have non-zero coefficients.
+ for (i = 0; i < 4; ++i) {
+ aom_idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ aom_idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = input[0 / 2];
+ step1[1] = input[16 / 2];
+ step1[2] = input[8 / 2];
+ step1[3] = input[24 / 2];
+ step1[4] = input[4 / 2];
+ step1[5] = input[20 / 2];
+ step1[6] = input[12 / 2];
+ step1[7] = input[28 / 2];
+ step1[8] = input[2 / 2];
+ step1[9] = input[18 / 2];
+ step1[10] = input[10 / 2];
+ step1[11] = input[26 / 2];
+ step1[12] = input[6 / 2];
+ step1[13] = input[22 / 2];
+ step1[14] = input[14 / 2];
+ step1[15] = input[30 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = WRAPLOW(step2[0] + step2[15]);
+ output[1] = WRAPLOW(step2[1] + step2[14]);
+ output[2] = WRAPLOW(step2[2] + step2[13]);
+ output[3] = WRAPLOW(step2[3] + step2[12]);
+ output[4] = WRAPLOW(step2[4] + step2[11]);
+ output[5] = WRAPLOW(step2[5] + step2[10]);
+ output[6] = WRAPLOW(step2[6] + step2[9]);
+ output[7] = WRAPLOW(step2[7] + step2[8]);
+ output[8] = WRAPLOW(step2[7] - step2[8]);
+ output[9] = WRAPLOW(step2[6] - step2[9]);
+ output[10] = WRAPLOW(step2[5] - step2[10]);
+ output[11] = WRAPLOW(step2[4] - step2[11]);
+ output[12] = WRAPLOW(step2[3] - step2[12]);
+ output[13] = WRAPLOW(step2[2] - step2[13]);
+ output[14] = WRAPLOW(step2[1] - step2[14]);
+ output[15] = WRAPLOW(step2[0] - step2[15]);
+}
+
+void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows
+ for (i = 0; i < 16; ++i) {
+ aom_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ aom_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = output[8] = output[9] = output[10] =
+ output[11] = output[12] = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4);
+ x1 = WRAPLOW(s1 + s5);
+ x2 = WRAPLOW(s2 + s6);
+ x3 = WRAPLOW(s3 + s7);
+ x4 = WRAPLOW(s0 - s4);
+ x5 = WRAPLOW(s1 - s5);
+ x6 = WRAPLOW(s2 - s6);
+ x7 = WRAPLOW(s3 - s7);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+ x8 = WRAPLOW(s8 + s10);
+ x9 = WRAPLOW(s9 + s11);
+ x10 = WRAPLOW(s8 - s10);
+ x11 = WRAPLOW(s9 - s11);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
+
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+ x10 = WRAPLOW(dct_const_round_shift(s10));
+ x11 = WRAPLOW(dct_const_round_shift(s11));
+ x14 = WRAPLOW(dct_const_round_shift(s14));
+ x15 = WRAPLOW(dct_const_round_shift(s15));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x8);
+ output[2] = WRAPLOW(x12);
+ output[3] = WRAPLOW(-x4);
+ output[4] = WRAPLOW(x6);
+ output[5] = WRAPLOW(x14);
+ output[6] = WRAPLOW(x10);
+ output[7] = WRAPLOW(x2);
+ output[8] = WRAPLOW(x3);
+ output[9] = WRAPLOW(x11);
+ output[10] = WRAPLOW(x15);
+ output[11] = WRAPLOW(x7);
+ output[12] = WRAPLOW(x5);
+ output[13] = WRAPLOW(-x13);
+ output[14] = WRAPLOW(x9);
+ output[15] = WRAPLOW(-x1);
+}
+
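+// The reduced-complexity 16x16 variants below rely on the non-zero
+// coefficients being confined to an upper-left corner of the block (see the
+// comments in each function), so only a subset of the row transforms is run.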
+void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 8x8 area, we only need to calculate the first 8 rows here.
+ for (i = 0; i < 8; ++i) {
+ aom_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ aom_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ aom_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ aom_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ if (a1 == 0) return;
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
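+// 32-point inverse DCT butterfly. The even-indexed inputs are loaded in a
+// permuted order into step1[0..15], while the odd-indexed inputs are rotated
+// by cospi constants into step1[16..31]; the remaining butterfly stages then
+// combine the two halves into the 32 outputs.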
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+ temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[31] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+ temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+ temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+ temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+ temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+ temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+ temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+ temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step2[16] = WRAPLOW(step1[16] + step1[17]);
+ step2[17] = WRAPLOW(step1[16] - step1[17]);
+ step2[18] = WRAPLOW(-step1[18] + step1[19]);
+ step2[19] = WRAPLOW(step1[18] + step1[19]);
+ step2[20] = WRAPLOW(step1[20] + step1[21]);
+ step2[21] = WRAPLOW(step1[20] - step1[21]);
+ step2[22] = WRAPLOW(-step1[22] + step1[23]);
+ step2[23] = WRAPLOW(step1[22] + step1[23]);
+ step2[24] = WRAPLOW(step1[24] + step1[25]);
+ step2[25] = WRAPLOW(step1[24] - step1[25]);
+ step2[26] = WRAPLOW(-step1[26] + step1[27]);
+ step2[27] = WRAPLOW(step1[26] + step1[27]);
+ step2[28] = WRAPLOW(step1[28] + step1[29]);
+ step2[29] = WRAPLOW(step1[28] - step1[29]);
+ step2[30] = WRAPLOW(-step1[30] + step1[31]);
+ step2[31] = WRAPLOW(step1[30] + step1[31]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19]);
+ step2[17] = WRAPLOW(step1[17] + step1[18]);
+ step2[18] = WRAPLOW(step1[17] - step1[18]);
+ step2[19] = WRAPLOW(step1[16] - step1[19]);
+ step2[20] = WRAPLOW(-step1[20] + step1[23]);
+ step2[21] = WRAPLOW(-step1[21] + step1[22]);
+ step2[22] = WRAPLOW(step1[21] + step1[22]);
+ step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27]);
+ step2[25] = WRAPLOW(step1[25] + step1[26]);
+ step2[26] = WRAPLOW(step1[25] - step1[26]);
+ step2[27] = WRAPLOW(step1[24] - step1[27]);
+ step2[28] = WRAPLOW(-step1[28] + step1[31]);
+ step2[29] = WRAPLOW(-step1[29] + step1[30]);
+ step2[30] = WRAPLOW(step1[29] + step1[30]);
+ step2[31] = WRAPLOW(step1[28] + step1[31]);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = WRAPLOW(step1[16] + step1[23]);
+ step2[17] = WRAPLOW(step1[17] + step1[22]);
+ step2[18] = WRAPLOW(step1[18] + step1[21]);
+ step2[19] = WRAPLOW(step1[19] + step1[20]);
+ step2[20] = WRAPLOW(step1[19] - step1[20]);
+ step2[21] = WRAPLOW(step1[18] - step1[21]);
+ step2[22] = WRAPLOW(step1[17] - step1[22]);
+ step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31]);
+ step2[25] = WRAPLOW(-step1[25] + step1[30]);
+ step2[26] = WRAPLOW(-step1[26] + step1[29]);
+ step2[27] = WRAPLOW(-step1[27] + step1[28]);
+ step2[28] = WRAPLOW(step1[27] + step1[28]);
+ step2[29] = WRAPLOW(step1[26] + step1[29]);
+ step2[30] = WRAPLOW(step1[25] + step1[30]);
+ step2[31] = WRAPLOW(step1[24] + step1[31]);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15]);
+ step1[1] = WRAPLOW(step2[1] + step2[14]);
+ step1[2] = WRAPLOW(step2[2] + step2[13]);
+ step1[3] = WRAPLOW(step2[3] + step2[12]);
+ step1[4] = WRAPLOW(step2[4] + step2[11]);
+ step1[5] = WRAPLOW(step2[5] + step2[10]);
+ step1[6] = WRAPLOW(step2[6] + step2[9]);
+ step1[7] = WRAPLOW(step2[7] + step2[8]);
+ step1[8] = WRAPLOW(step2[7] - step2[8]);
+ step1[9] = WRAPLOW(step2[6] - step2[9]);
+ step1[10] = WRAPLOW(step2[5] - step2[10]);
+ step1[11] = WRAPLOW(step2[4] - step2[11]);
+ step1[12] = WRAPLOW(step2[3] - step2[12]);
+ step1[13] = WRAPLOW(step2[2] - step2[13]);
+ step1[14] = WRAPLOW(step2[1] - step2[14]);
+ step1[15] = WRAPLOW(step2[0] - step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31]);
+ output[1] = WRAPLOW(step1[1] + step1[30]);
+ output[2] = WRAPLOW(step1[2] + step1[29]);
+ output[3] = WRAPLOW(step1[3] + step1[28]);
+ output[4] = WRAPLOW(step1[4] + step1[27]);
+ output[5] = WRAPLOW(step1[5] + step1[26]);
+ output[6] = WRAPLOW(step1[6] + step1[25]);
+ output[7] = WRAPLOW(step1[7] + step1[24]);
+ output[8] = WRAPLOW(step1[8] + step1[23]);
+ output[9] = WRAPLOW(step1[9] + step1[22]);
+ output[10] = WRAPLOW(step1[10] + step1[21]);
+ output[11] = WRAPLOW(step1[11] + step1[20]);
+ output[12] = WRAPLOW(step1[12] + step1[19]);
+ output[13] = WRAPLOW(step1[13] + step1[18]);
+ output[14] = WRAPLOW(step1[14] + step1[17]);
+ output[15] = WRAPLOW(step1[15] + step1[16]);
+ output[16] = WRAPLOW(step1[15] - step1[16]);
+ output[17] = WRAPLOW(step1[14] - step1[17]);
+ output[18] = WRAPLOW(step1[13] - step1[18]);
+ output[19] = WRAPLOW(step1[12] - step1[19]);
+ output[20] = WRAPLOW(step1[11] - step1[20]);
+ output[21] = WRAPLOW(step1[10] - step1[21]);
+ output[22] = WRAPLOW(step1[9] - step1[22]);
+ output[23] = WRAPLOW(step1[8] - step1[23]);
+ output[24] = WRAPLOW(step1[7] - step1[24]);
+ output[25] = WRAPLOW(step1[6] - step1[25]);
+ output[26] = WRAPLOW(step1[5] - step1[26]);
+ output[27] = WRAPLOW(step1[4] - step1[27]);
+ output[28] = WRAPLOW(step1[3] - step1[28]);
+ output[29] = WRAPLOW(step1[2] - step1[29]);
+ output[30] = WRAPLOW(step1[1] - step1[30]);
+ output[31] = WRAPLOW(step1[0] - step1[31]);
+}
+
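+// Full 32x32 inverse transform. The row pass ORs each row's coefficients
+// together in a small reduction tree so that all-zero rows can be skipped
+// with a memset; the column pass then transforms every column and adds the
+// rounded result to dest.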
+void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ int16_t zero_coeff[16];
+ for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ aom_idct32_c(input, outptr);
+ else
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ aom_idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+  // Only the upper-left 16x16 area has non-zero coefficients.
+ for (i = 0; i < 16; ++i) {
+ aom_idct32_c(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ aom_idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+  // Only the upper-left 8x8 area has non-zero coefficients.
+ for (i = 0; i < 8; ++i) {
+ aom_idct32_c(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ aom_idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
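+// DC-only 32x32 path: the single DC coefficient is scaled by cospi_16_64 once
+// per 1-D pass, rounded by 2^6, and the resulting constant is added to every
+// pixel of the block.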
+void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ if (a1 == 0) return;
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = HIGHBD_WRAPLOW(a1, bd);
+ op[1] = HIGHBD_WRAPLOW(b1, bd);
+ op[2] = HIGHBD_WRAPLOW(c1, bd);
+ op[3] = HIGHBD_WRAPLOW(d1, bd);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] =
+ highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
+ dest[stride * 1] =
+ highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
+ dest[stride * 2] =
+ highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
+ dest[stride * 3] =
+ highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
+
+ ip++;
+ dest++;
+ }
+}
+
+void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ (void)bd;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = HIGHBD_WRAPLOW(a1, bd);
+ op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] =
+ highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
+ dest[dest_stride * 1] =
+ highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
+ dest[dest_stride * 2] =
+ highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
+ dest[dest_stride * 3] =
+ highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
+ ip++;
+ dest++;
+ }
+}
+
+void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ (void)bd;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ // stage 2
+ output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+ output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+ output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+ output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
+}
+
+void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ aom_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+ aom_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+}
+
+void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+ dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+ dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+ dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
+ dest += dest_stride;
+ }
+}
+
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h
new file mode 100644
index 000000000..e64d463ea
--- /dev/null
+++ b/third_party/aom/aom_dsp/inv_txfm.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_INV_TXFM_H_
+#define AOM_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
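+// Rounds a product with one of the cospi_*_64 constants (which carry
+// DCT_CONST_BITS fractional bits) back to integer precision, rounding to
+// nearest.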
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return rv;
+}
+
+static INLINE tran_high_t check_range(tran_high_t input, int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid AV1 input streams, intermediate stage coefficients should always
+ // stay within the range of a signed 16 bit integer. Coefficients can go out
+ // of this range for invalid/corrupt AV1 streams. However, strictly checking
+  // this range for every intermediate coefficient can be burdensome for a decoder,
+ // therefore the following assertion is only enabled when configured with
+ // --enable-coefficient-range-checking.
+ // For valid highbitdepth AV1 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+ const int32_t int_max = (1 << (7 + bd)) - 1;
+ const int32_t int_min = -int_max - 1;
+ assert(int_min <= input);
+ assert(input <= int_max);
+ (void)int_min;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ (void)bd;
+ return input;
+}
+
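+// With CONFIG_COEFFICIENT_RANGE_CHECKING disabled these macros are simple
+// pass-throughs kept in 32-bit precision; with it enabled they assert that
+// intermediate coefficients stay within the bit-depth-dependent ranges above.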
+#define WRAPLOW(x) ((int32_t)check_range(x, 8))
+#if CONFIG_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd))
+#endif // CONFIG_HIGHBITDEPTH
+
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+ int bd) {
+ trans = HIGHBD_WRAPLOW(trans, bd);
+ return clip_pixel_highbd(dest + (int)trans, bd);
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+ trans = WRAPLOW(trans);
+ return clip_pixel(dest + (int)trans);
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_INV_TXFM_H_
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
new file mode 100644
index 000000000..e2e839219
--- /dev/null
+++ b/third_party/aom/aom_dsp/loopfilter.c
@@ -0,0 +1,900 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+ return (int8_t)clamp(t, -128, 127);
+}
+
+#define PARALLEL_DEBLOCKING_11_TAP 0
+#define PARALLEL_DEBLOCKING_9_TAP 0
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+ switch (bd) {
+ case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+ case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+ case 8:
+ default: return (int16_t)clamp(t, -128, 128 - 1);
+ }
+}
+#endif
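+// Each "|= (condition) * -1" in the mask helpers below sets the mask byte to
+// 0xff when that pixel difference exceeds its limit; returning ~mask therefore
+// yields 0xff (apply the filter) only when every difference is within range.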
+#if CONFIG_PARALLEL_DEBLOCKING
+// Should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+ uint8_t p0, uint8_t q0, uint8_t q1) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+#endif // CONFIG_PARALLEL_DEBLOCKING
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p3 - p2) > limit) * -1;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(q3 - q2) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+ uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
+ uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > thresh) * -1;
+ mask |= (abs(q1 - q0) > thresh) * -1;
+ mask |= (abs(p2 - p0) > thresh) * -1;
+ mask |= (abs(q2 - q0) > thresh) * -1;
+ mask |= (abs(p3 - p0) > thresh) * -1;
+ mask |= (abs(q3 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+#if PARALLEL_DEBLOCKING_9_TAP
+static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0,
+ uint8_t q0, uint8_t q4) {
+ int8_t mask = 0;
+ mask |= (abs(p4 - p0) > thresh) * -1;
+ mask |= (abs(q4 - q0) > thresh) * -1;
+ return ~mask;
+}
+#endif
+
+#if PARALLEL_DEBLOCKING_11_TAP
+static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4,
+ uint8_t p0, uint8_t q0, uint8_t q4,
+ uint8_t q5) {
+ int8_t mask = 0;
+ mask |= (abs(p4 - p0) > thresh) * -1;
+ mask |= (abs(q4 - q0) > thresh) * -1;
+ mask |= (abs(p5 - p0) > thresh) * -1;
+ mask |= (abs(q5 - q0) > thresh) * -1;
+ return ~mask;
+}
+#endif
+
+static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3,
+ uint8_t q4) {
+ int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
+ mask |= (abs(p4 - p0) > thresh) * -1;
+ mask |= (abs(q4 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance at this internal edge: 11111111 yes, 00000000 no
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1) {
+ int8_t hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
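+// 4-tap filter: adjusts up to two pixels on each side of the edge; the outer
+// p1/q1 pixels are only modified where the high-edge-variance mask is clear.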
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+ int8_t filter1, filter2;
+
+ const int8_t ps1 = (int8_t)*op1 ^ 0x80;
+ const int8_t ps0 = (int8_t)*op0 ^ 0x80;
+ const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
+ const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
+ const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+
+ // add outer taps if we have high edge variance
+ int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+ // inner taps
+ filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+  // Save the bottom 3 bits so that we round one side +4 and the other +3;
+  // if it equals 4 we adjust by -1 to account for the fact that we'd round
+  // 3 the other way.
+ filter1 = signed_char_clamp(filter + 4) >> 3;
+ filter2 = signed_char_clamp(filter + 3) >> 3;
+
+ *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
+ *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
+
+ // outer tap adjustments
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
+ *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+}
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
+ const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else // CONFIG_PARALLEL_DEBLOCKING
+ const uint8_t p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p];
+ const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else // CONFIG_PARALLEL_DEBLOCKING
+ const uint8_t p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1];
+ const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
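+// 8-pixel-wide variant: when both the flatness and filter masks are set, the
+// three pixels on each side of the edge are replaced by a 7-tap smoothing
+// filter; otherwise it falls back to the 4-tap filter4().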
+static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint8_t *op3, uint8_t *op2, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
+ uint8_t *oq2, uint8_t *oq3) {
+ if (flat && mask) {
+ const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ filter4(mask, thresh, op1, op0, oq0, oq1);
+ }
+}
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+ const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+ s + 3);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+#if PARALLEL_DEBLOCKING_11_TAP
+static INLINE void filter12(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint8_t flat2, uint8_t *op5, uint8_t *op4,
+ uint8_t *op3, uint8_t *op2, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
+ uint8_t *oq2, uint8_t *oq3, uint8_t *oq4,
+ uint8_t *oq5) {
+ if (flat2 && flat && mask) {
+ const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1,
+ p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
+ q5 = *oq5;
+
+ // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]
+ *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12;
+ *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12;
+ *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12;
+ *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12;
+ *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12;
+ *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12;
+ *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12;
+ *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12;
+ *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12;
+ *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12;
+ } else {
+ filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+ }
+}
+#endif
+
+#if PARALLEL_DEBLOCKING_9_TAP
+static INLINE void filter10(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint8_t flat2, uint8_t *op4, uint8_t *op3,
+ uint8_t *op2, uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
+ uint8_t *oq3, uint8_t *oq4) {
+ if (flat2 && flat && mask) {
+ const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4;
+
+ // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1]
+ *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10;
+ *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10;
+ *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10;
+ *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10;
+ *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10;
+ *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10;
+ *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) / 10;
+ *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10;
+ } else {
+ filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+ }
+}
+#endif
+
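+// Widest variant: when both flatness masks and the filter mask are set, the
+// seven pixels on each side of the edge are replaced by a 15-tap smoothing
+// filter; otherwise it falls back to filter8().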
+static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint8_t flat2, uint8_t *op7, uint8_t *op6,
+ uint8_t *op5, uint8_t *op4, uint8_t *op3,
+ uint8_t *op2, uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
+ uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
+ uint8_t *oq6, uint8_t *oq7) {
+ if (flat2 && flat && mask) {
+ const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
+ p2 = *op2, p1 = *op1, p0 = *op0;
+
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
+ q5 = *oq5, q6 = *oq6, q7 = *oq7;
+
+ // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+ *op6 = ROUND_POWER_OF_TWO(
+ p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+ *op5 = ROUND_POWER_OF_TWO(
+ p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+ q0 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
+ q1 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
+ q2 + q3 + q4 + q5 + q6 + q7,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
+ q3 + q4 + q5 + q6 + q7 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+ *oq6 = ROUND_POWER_OF_TWO(
+ p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+ } else {
+ filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+ }
+}
+
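+// Filters count blocks of 8 pixels along a horizontal edge. The widest filter
+// actually used per pixel depends on the PARALLEL_DEBLOCKING_* settings above
+// (11-tap, 9-tap, or the default 15-tap filter16 path).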
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
+ p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p],
+ p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
+ q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+
+#if PARALLEL_DEBLOCKING_11_TAP
+ const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
+
+ filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p,
+ s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p,
+ s + 3 * p, s + 4 * p, s + 5 * p);
+
+#elif PARALLEL_DEBLOCKING_9_TAP
+ const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
+
+ filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p,
+ s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p,
+ s + 4 * p);
+#else
+ const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
+
+ filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
+ s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
+ s + 7 * p);
+#endif
+
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+}
+
+void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+}
+
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4],
+ p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
+ q5 = s[5], q6 = s[6], q7 = s[7];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+
+#if PARALLEL_DEBLOCKING_11_TAP
+ const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
+
+ filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2,
+ s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5);
+#elif PARALLEL_DEBLOCKING_9_TAP
+ const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
+
+ filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s,
+ s + 1, s + 2, s + 3, s + 4);
+
+#else
+ const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
+
+ filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
+ s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
+ s + 7);
+#endif
+
+ s += p;
+ }
+}
+
+void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+}
+
+void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+}
+
+#if CONFIG_HIGHBITDEPTH
+#if CONFIG_PARALLEL_DEBLOCKING
+// Should we apply any filter at all: 11111111 yes, 00000000 no ?
+static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+#endif // CONFIG_PARALLEL_DEBLOCKING
+
+// Should we apply any filter at all: 11111111 yes, 00000000 no ?
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+ uint16_t p3, uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0, uint16_t q1,
+ uint16_t q2, uint16_t q3, int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p3 - p2) > limit16) * -1;
+ mask |= (abs(p2 - p1) > limit16) * -1;
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(q2 - q1) > limit16) * -1;
+ mask |= (abs(q3 - q2) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2, uint16_t q3,
+ int bd) {
+ int8_t mask = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p1 - p0) > thresh16) * -1;
+ mask |= (abs(q1 - q0) > thresh16) * -1;
+ mask |= (abs(p2 - p0) > thresh16) * -1;
+ mask |= (abs(q2 - q0) > thresh16) * -1;
+ mask |= (abs(p3 - p0) > thresh16) * -1;
+ mask |= (abs(q3 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
+ uint16_t p2, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, uint16_t q2,
+ uint16_t q3, uint16_t q4, int bd) {
+ int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p4 - p0) > thresh16) * -1;
+ mask |= (abs(q4 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance at this internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no?
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, int bd) {
+ int16_t hev = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ hev |= (abs(p1 - p0) > thresh16) * -1;
+ hev |= (abs(q1 - q0) > thresh16) * -1;
+ return hev;
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ int bd) {
+ int16_t filter1, filter2;
+  // Subtracting 0x80 << shift re-centers the bd-bit samples around zero, the
+  // high-bit-depth analogue of the ^0x80 trick in the 8-bit filter4() that
+  // turns 0..255 into -128..+127.
+ int shift = bd - 8;
+ const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+ const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+ const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+ const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+ const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+ // Add outer taps if we have high edge variance.
+ int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+ // Inner taps.
+ filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+  // Save the bottom 3 bits so that we round one side +4 and the other +3;
+  // if it equals 4 we adjust by -1 to account for the fact that we'd round
+  // 3 the other way.
+ filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+ filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+
+ *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+ *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+ // Outer tap adjustments.
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+ *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
+ const uint16_t p3 = s[-4 * p];
+ const uint16_t p2 = s[-3 * p];
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const uint16_t q2 = s[2 * p];
+ const uint16_t q3 = s[3 * p];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else // CONFIG_PARALLEL_DEBLOCKING
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const int8_t mask =
+ highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_4_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else // CONFIG_PARALLEL_DEBLOCKING
+ const uint16_t p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1];
+ const int8_t mask =
+ highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_4_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint16_t *op3, uint16_t *op2, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ uint16_t *oq2, uint16_t *oq3, int bd) {
+ if (flat && mask) {
+ const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ }
+}
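+
+/* Editorial sketch, not part of the upstream aom sources: each output of the
+ * branch above is the [1, 1, 1, 2, 1, 1, 1] window centered on the output
+ * pixel, with samples outside p3..q3 replaced by the nearest edge sample, and
+ * rounded with a 3-bit shift because the taps sum to 8.  A scalar reference
+ * over one 8-pixel segment (assumes <stdint.h> types):
+ */
+#if 0 /* illustrative example only, not compiled */
+static void filter8_reference(const uint16_t in[8], /* p3 p2 p1 p0 q0 q1 q2 q3 */
+                              uint16_t out[6] /* p2 p1 p0 q0 q1 q2 */) {
+  static const int taps[7] = { 1, 1, 1, 2, 1, 1, 1 };
+  int i, k;
+  for (i = 0; i < 6; ++i) { /* output positions p2..q2 are in[1]..in[6] */
+    int sum = 0;
+    for (k = 0; k < 7; ++k) {
+      int idx = (i + 1) + (k - 3); /* window centered on the output pixel */
+      if (idx < 0) idx = 0;        /* replicate p3 past the left end */
+      if (idx > 7) idx = 7;        /* replicate q3 past the right end */
+      sum += taps[k] * in[idx];
+    }
+    out[i] = (uint16_t)((sum + 4) >> 3); /* ROUND_POWER_OF_TWO(sum, 3),
+                                            assuming (x + 4) >> 3 rounding */
+  }
+}
+#endif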
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+
+  // Derived from the 8-bit loop filter, which was designed to work on chars
+  // to make maximum use of 8-bit SIMD instructions.
+ for (i = 0; i < 8; ++i) {
+ const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
+ s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_8_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
+ s + 2, s + 3, bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_8_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint8_t flat2, uint16_t *op7, uint16_t *op6,
+ uint16_t *op5, uint16_t *op4, uint16_t *op3,
+ uint16_t *op2, uint16_t *op1, uint16_t *op0,
+ uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
+ uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
+ uint16_t *oq6, uint16_t *oq7, int bd) {
+ if (flat2 && flat && mask) {
+ const uint16_t p7 = *op7;
+ const uint16_t p6 = *op6;
+ const uint16_t p5 = *op5;
+ const uint16_t p4 = *op4;
+ const uint16_t p3 = *op3;
+ const uint16_t p2 = *op2;
+ const uint16_t p1 = *op1;
+ const uint16_t p0 = *op0;
+ const uint16_t q0 = *oq0;
+ const uint16_t q1 = *oq1;
+ const uint16_t q2 = *oq2;
+ const uint16_t q3 = *oq3;
+ const uint16_t q4 = *oq4;
+ const uint16_t q5 = *oq5;
+ const uint16_t q6 = *oq6;
+ const uint16_t q7 = *oq7;
+
+ // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+ *op6 = ROUND_POWER_OF_TWO(
+ p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+ *op5 = ROUND_POWER_OF_TWO(
+ p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+ q0 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
+ q1 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
+ q2 + q3 + q4 + q5 + q6 + q7,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
+ q3 + q4 + q5 + q6 + q7 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+ *oq6 = ROUND_POWER_OF_TWO(
+ p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+ } else {
+ highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ bd);
+ }
+}
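+
+/* Editorial sketch, not part of the upstream aom sources: the three flags
+ * select progressively wider smoothing.  A scalar mirror of the cascade
+ * formed by highbd_filter16 -> highbd_filter8 -> highbd_filter4:
+ */
+#if 0 /* illustrative example only, not compiled */
+typedef enum { FILTER_NONE, FILTER_4, FILTER_8, FILTER_16 } filter_kind;
+static filter_kind pick_highbd_filter(int mask, int flat, int flat2) {
+  if (!mask) return FILTER_NONE;       /* edge too strong: leave pixels alone */
+  if (flat && flat2) return FILTER_16; /* wide neighborhood flat: 15-tap p7..q7 */
+  if (flat) return FILTER_8;           /* inner neighborhood flat: 7-tap p3..q3 */
+  return FILTER_4;                     /* narrow, hev-gated 4-pixel filter */
+}
+#endif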
+
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+
+  // Derived from the 8-bit loop filter, which was designed to work on chars
+  // to make maximum use of 8-bit SIMD instructions.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint16_t p3 = s[-4 * p];
+ const uint16_t p2 = s[-3 * p];
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const uint16_t q2 = s[2 * p];
+ const uint16_t q3 = s[3 * p];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 =
+ highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
+ s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
+
+ highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
+ s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
+ s + 6 * p, s + 7 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+}
+
+void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+}
+
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4];
+ const uint16_t p2 = s[-3];
+ const uint16_t p1 = s[-2];
+ const uint16_t p0 = s[-1];
+ const uint16_t q0 = s[0];
+ const uint16_t q1 = s[1];
+ const uint16_t q2 = s[2];
+ const uint16_t q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+ q0, s[4], s[5], s[6], s[7], bd);
+
+ highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
+ s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
+ s + 5, s + 6, s + 7, bd);
+ s += p;
+ }
+}
+
+void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+}
+
+void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
new file mode 100644
index 000000000..4c6e201e1
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+ char blackclamp[16], char whiteclamp[16],
+ char bothclamp[16], uint32_t width,
+ uint32_t height, int32_t pitch) {
+ uint32_t i, j;
+
+ for (i = 0; i < height / 2; ++i) {
+ uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+ int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
+ uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+ int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
+ for (j = width / 16; j--;) {
+ v16i8 temp00_s, temp01_s;
+ v16u8 temp00, temp01, black_clamp, white_clamp;
+ v16u8 pos0, ref0, pos1, ref1;
+ v16i8 const127 = __msa_ldi_b(127);
+
+ pos0 = LD_UB(pos0_ptr);
+ ref0 = LD_UB(ref0_ptr);
+ pos1 = LD_UB(pos1_ptr);
+ ref1 = LD_UB(ref1_ptr);
+ black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+ white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+ temp00 = (pos0 < black_clamp);
+ pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+ temp01 = (pos1 < black_clamp);
+ pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+ XORI_B2_128_UB(pos0, pos1);
+ temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+ temp00 = (v16u8)(temp00_s < pos0);
+ pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+ temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+ temp01 = (temp01_s < pos1);
+ pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+ XORI_B2_128_UB(pos0, pos1);
+ pos0 += ref0;
+ ST_UB(pos0, pos0_ptr);
+ pos1 += ref1;
+ ST_UB(pos1, pos1_ptr);
+ pos0_ptr += 16;
+ pos1_ptr += 16;
+ ref0_ptr += 16;
+ ref1_ptr += 16;
+ }
+ }
+}
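+
+/* Editorial sketch, not part of the upstream aom sources, and not a bit-exact
+ * transcription of the MSA intrinsics above: the general shape is that each
+ * pixel is clamped away from the black/white extremes (so adding noise cannot
+ * wrap) and a signed dither sample from the pre-generated noise row is then
+ * added.  The lo/hi limits below are hypothetical stand-ins for the values
+ * carried in blackclamp[] / whiteclamp[].
+ */
+#if 0 /* illustrative example only, not compiled */
+static void add_noise_row_sketch(uint8_t *pos, const int8_t *ref, int width,
+                                 uint8_t lo, uint8_t hi) {
+  int j;
+  for (j = 0; j < width; ++j) {
+    int v = pos[j];
+    if (v < lo) v = lo;             /* keep headroom below */
+    if (v > hi) v = hi;             /* keep headroom above */
+    pos[j] = (uint8_t)(v + ref[j]); /* add the signed noise sample */
+  }
+}
+#endif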
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
new file mode 100644
index 000000000..847394a3d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst0, dst1, dst2, dst3, res2, res3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, res0, res1;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, res0, res1);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ SRARI_H2_SH(res0, res1, FILTER_BITS);
+ SAT_SH2_SH(res0, res1, 7);
+ PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+ XORI_B2_128_UB(res2, res3);
+ AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
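+
+/* Editorial sketch, not part of the upstream aom sources: the 8-tap kernels
+ * in this file compute, per output pixel, a horizontal 8-tap convolution
+ * rounded by FILTER_BITS (assumed to be 7), clipped to 8-bit range, and then
+ * averaged into the existing destination with +1 rounding.  A plain scalar
+ * version of that operation (src must have at least 3 valid pixels to its
+ * left and 4 to its right on every row):
+ */
+#if 0 /* illustrative example only, not compiled */
+static void convolve8_avg_horiz_sketch(const uint8_t *src, int src_stride,
+                                       uint8_t *dst, int dst_stride,
+                                       const int16_t *filter, /* 8 taps */
+                                       int w, int h) {
+  int x, y, k;
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      int sum = 0;
+      for (k = 0; k < 8; ++k) sum += src[x - 3 + k] * filter[k];
+      sum = (sum + 64) >> 7;                       /* round by FILTER_BITS */
+      if (sum < 0) sum = 0;
+      if (sum > 255) sum = 255;                    /* clip to pixel range */
+      dst[x] = (uint8_t)((dst[x] + sum + 1) >> 1); /* average with dst */
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+#endif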
+
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8i16 filt, vec0, vec1, vec2, vec3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, vec0, vec1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, vec2, vec3);
+ SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
+ res3);
+ ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+ XORI_B2_128_UB(res0, res2);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+ dst6);
+ ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
+ AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+ ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ LD_SB2(src, src_stride, src0, src2);
+ LD_SB2(src + 8, src_stride, src1, src3);
+ src += (2 * src_stride);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+ vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+ vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ LD_UB2(dst, 16, dst1, dst2);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ for (cnt = 0; cnt < 2; ++cnt) {
+ src0 = LD_SB(&src[cnt << 5]);
+ src2 = LD_SB(&src[16 + (cnt << 5)]);
+ src3 = LD_SB(&src[24 + (cnt << 5)]);
+ src1 = __msa_sldi_b(src2, src0, 8);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+ vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+ vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+ vec1, vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+ vec1, vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+ v8u16 vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+ PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+ AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
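+
+/* Editorial note, not part of the upstream aom sources: in the 2-tap path the
+ * dispatcher at the bottom of this file passes &filt_hor[3], so on a
+ * little-endian target LD_UH picks up filter taps 3 and 4 packed into one
+ * 16-bit lane, and each byte-pair dot product above computes
+ * src[x] * f3 + src[x + 1] * f4.  A scalar sketch of one averaged output row,
+ * assuming FILTER_BITS == 7 and non-negative taps summing to 128 (so no
+ * clipping is required):
+ */
+#if 0 /* illustrative example only, not compiled */
+static void bilinear_avg_row_sketch(const uint8_t *src, uint8_t *dst, int w,
+                                    int f3, int f4) {
+  int x;
+  for (x = 0; x < w; ++x) {
+    const int sum = src[x] * f3 + src[x + 1] * f4;
+    const int val = (sum + 64) >> 7;             /* round by FILTER_BITS */
+    dst[x] = (uint8_t)((dst[x] + val + 1) >> 1); /* average with dst */
+  }
+}
+#endif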
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8u16 vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+ dst6);
+ AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
+ dst_stride);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+ dst += dst_stride;
+
+ for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LD_SB(src);
+ src6 = LD_SB(src + 16);
+ src7 = LD_SB(src + 24);
+ src5 = __msa_sldi_b(src6, src4, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB2(dst, 16, dst0, dst1);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+ dst += dst_stride;
+ LD_UB2(dst, 16, dst2, dst3);
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src2, src4, src6);
+ src7 = LD_SB(src + 56);
+ SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+ PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+ PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+ PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_hor[8];
+
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
+ if (((const int32_t *)filter_x)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ default:
+ aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ default:
+ aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
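+
+/* Editorial note, not part of the upstream aom sources: the int32_t casts
+ * above test two adjacent 16-bit taps at once.  On a little-endian target,
+ * ((const int32_t *)filter_x)[0] == 0 means taps 0 and 1 are zero (the
+ * bilinear case, whose useful taps sit at positions 3 and 4), and 0x800000
+ * in taps 2..3 corresponds to { 0, 128 }, i.e. the pure copy filter.  A
+ * hypothetical illustration of that layout:
+ */
+#if 0 /* illustrative example only, not compiled */
+#include <assert.h>
+#include <stdint.h>
+static void filter_layout_example(void) {
+  static const int16_t bilinear[8] = { 0, 0, 0, 96, 32, 0, 0, 0 };
+  static const int16_t copy_only[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
+  assert(((const int32_t *)bilinear)[0] == 0);         /* taps 0..1 are zero */
+  assert(((const int32_t *)copy_only)[1] == 0x800000); /* taps 2..3 = { 0, 128 } */
+}
+#endif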
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
new file mode 100644
index 000000000..bed600d5b
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+ vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+ vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+ SRARI_H2_SH(res0, res1, FILTER_BITS);
+ SAT_SH2_SH(res0, res1, 7);
+ PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
+ XORI_B2_128_UB(tmp0, tmp1);
+ AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
+ ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out5 = hz_out9;
+ vec0 = vec2;
+ vec1 = vec3;
+ vec2 = vec4;
+ }
+}
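+
+/* Editorial sketch, not part of the upstream aom sources and not intended to
+ * be bit-exact with the MSA code above: the routine performs separable 2-D
+ * filtering.  An 8-tap horizontal pass over (h + 7) rows feeds an 8-tap
+ * vertical pass, and the result is averaged into dst.  A plain scalar version
+ * with both passes rounded by FILTER_BITS (assumed 7); src must be readable
+ * 3 pixels/rows before and 4 after the block in each direction, and w, h must
+ * not exceed 64 because of the fixed temp buffer:
+ */
+#if 0 /* illustrative example only, not compiled */
+static int clip_byte(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }
+static void convolve8_avg_2d_sketch(const uint8_t *src, int src_stride,
+                                    uint8_t *dst, int dst_stride,
+                                    const int16_t *fh, const int16_t *fv,
+                                    int w, int h) {
+  int temp[64 + 7][64]; /* horizontally filtered rows, 3 above and 4 below */
+  int x, y, k;
+  for (y = 0; y < h + 7; ++y) {
+    for (x = 0; x < w; ++x) {
+      int sum = 0;
+      for (k = 0; k < 8; ++k) sum += src[(y - 3) * src_stride + x - 3 + k] * fh[k];
+      temp[y][x] = clip_byte((sum + 64) >> 7);
+    }
+  }
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      int sum = 0;
+      for (k = 0; k < 8; ++k) sum += temp[y + k][x] * fv[k];
+      sum = clip_byte((sum + 64) >> 7);
+      dst[y * dst_stride + x] = (uint8_t)((dst[y * dst_stride + x] + sum + 1) >> 1);
+    }
+  }
+}
+#endif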
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+ ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out6 = hz_out10;
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
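+
+/* Editorial note, not part of the upstream aom sources: the 16-, 32- and
+ * 64-wide wrappers above simply tile the 8-pixel-wide kernel across the
+ * block, re-running the full height for every 8-column strip.  The same
+ * pattern written generically; kern8 stands for a kernel with the signature
+ * of common_hv_8ht_8vt_and_aver_dst_8w_msa:
+ */
+#if 0 /* illustrative example only, not compiled */
+static void tile_8wide_columns(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride, int8_t *fh,
+                               int8_t *fv, int32_t w, int32_t h,
+                               void (*kern8)(const uint8_t *, int32_t,
+                                             uint8_t *, int32_t, int8_t *,
+                                             int8_t *, int32_t)) {
+  int32_t col;
+  for (col = 0; col < w; col += 8) {
+    kern8(src + col, src_stride, dst + col, dst_stride, fh, fv, h);
+  }
+}
+#endif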
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 dst0, dst1, dst2, dst3, res0, res1;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
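+
+/* Editorial sketch, not part of the upstream aom sources: the 2-tap/2-tap
+ * path is a bilinear resampler.  Each output is a horizontal 2-tap filter of
+ * two adjacent rows, followed by a vertical 2-tap filter of those two
+ * results, both rounded by FILTER_BITS (assumed 7), then averaged into dst.
+ * A scalar version (taps assumed non-negative and summing to 128; the MSA
+ * code caches the previous filtered row instead of recomputing it):
+ */
+#if 0 /* illustrative example only, not compiled */
+static void bilinear_2d_avg_sketch(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride, int w, int h,
+                                   int fh0, int fh1, int fv0, int fv1) {
+  int x, y;
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const int a = (src[y * src_stride + x] * fh0 +
+                     src[y * src_stride + x + 1] * fh1 + 64) >> 7;
+      const int b = (src[(y + 1) * src_stride + x] * fh0 +
+                     src[(y + 1) * src_stride + x + 1] * fh1 + 64) >> 7;
+      const int v = (a * fv0 + b * fv1 + 64) >> 7;
+      dst[y * dst_stride + x] = (uint8_t)((dst[y * dst_stride + x] + v + 1) >> 1);
+    }
+  }
+}
+#endif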
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+ hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+ hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+ hz_out3, hz_out5, 8);
+ hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+ dst6);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
+ tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
+ res3);
+ AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+ dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else {
+ common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+ src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+ dst += dst_stride;
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
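+  /* A filter whose first two taps are zero is treated as bilinear: only taps
+   * 3 and 4 (passed as &filt_hor[3] / &filt_ver[3]) feed the *_2t_* kernels.
+   * Mixed bilinear/8-tap combinations have no MSA path and fall back to the
+   * C reference below. */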
+ if (((const int32_t *)filter_x)[0] == 0 &&
+ ((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ default:
+ aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else if (((const int32_t *)filter_x)[0] == 0 ||
+ ((const int32_t *)filter_y)[0] == 0) {
+ aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ default:
+ aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
new file mode 100644
index 000000000..dae771104
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
@@ -0,0 +1,677 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
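+/* 8-tap vertical filter with destination averaging for 4-pixel-wide blocks:
+ * seven source rows are pre-loaded, then four output rows are produced and
+ * averaged with the existing dst pixels per loop iteration. */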
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0, dst1, dst2, dst3, out;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+ v16i8 src10998, filt0, filt1, filt2, filt3;
+ v8i16 filt, out10, out32;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+ src4332, src6554);
+ XORI_B3_128_SB(src2110, src4332, src6554);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ XORI_B2_128_SB(src8776, src10998);
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+ filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+ filt1, filt2, filt3);
+ SRARI_H2_SH(out10, out32, FILTER_BITS);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+ dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
+ out = __msa_aver_u_b(out, dst0);
+
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
+ filt2, filt3);
+ out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
+ filt2, filt3);
+ out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
+ filt2, filt3);
+ out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
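+/* Generic 16-pixel-wide column worker: the 16/32/64-wide wrappers below call
+ * it with the corresponding width and it walks each row in 16-pixel columns,
+ * filtering vertically with 8 taps and averaging into dst. */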
+static void common_vt_8t_and_aver_dst_16w_mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height, int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+
+ LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+ out3_r, tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
+ dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4;
+ v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+ v16i8 src10_r, src32_r, src21_r, src43_r;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ src4 = LD_SB(src);
+ src += src_stride;
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, dst0);
+
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ v16u8 src2110, src4332, src6554, src8776, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
+ dst3);
+ ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+ dst_stride);
+}
+
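+/* Bilinear vertical filter with destination averaging for 8-wide blocks
+ * taller than four rows; eight output rows are produced per loop iteration. */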
+static void common_vt_2t_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+ LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_UB2(src, 16, src0, src5);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+ LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+ LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+ src += (4 * src_stride);
+
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+
+ ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+ ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+ ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
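+/* Bilinear vertical filter with destination averaging for 64-wide blocks:
+ * each iteration produces two rows, handled as four 16-byte columns. */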
+static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5;
+ v16u8 src6, src7, src8, src9, src10, src11, filt0;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_UB4(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_UB2(src, src_stride, src1, src2);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ LD_UB2(src + 16, src_stride, src4, src5);
+ LD_UB2(dst + 16, dst_stride, dst2, dst3);
+ LD_UB2(src + 32, src_stride, src7, src8);
+ LD_UB2(dst + 32, dst_stride, dst4, dst5);
+ LD_UB2(src + 48, src_stride, src10, src11);
+ LD_UB2(dst + 48, dst_stride, dst6, dst7);
+ src += (2 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+ ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+ ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+ ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+
+ ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+ ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
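+  /* Taps 0 and 1 being zero marks a bilinear filter; dispatch to the 2-tap
+   * vertical kernels, otherwise use the full 8-tap ones. */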
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ default:
+ aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 16:
+        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+ case 32:
+ common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ default:
+ aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
new file mode 100644
index 000000000..fc3a823c5
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
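+/* 8-tap horizontal filter for a 4x4 block: source bytes are XORed with 128 to
+ * move them into signed range for the dot-product macros, filtered, rounded
+ * by FILTER_BITS, saturated, then packed back to unsigned 8-bit. */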
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v8i16 filt, out0, out1;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ SRARI_H2_SH(out0, out1, FILTER_BITS);
+ SAT_SH2_SH(out0, out1, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1, out2,
+ out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_SB2(src, src_stride, src0, src2);
+ LD_SB2(src + 8, src_stride, src1, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (2 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+
+ src0 = LD_SB(src + 32);
+ src2 = LD_SB(src + 48);
+ src3 = LD_SB(src + 56);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst + 32);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, vec0, vec1, res0, res1;
+ v8u16 vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+ PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 vec0, vec1, vec2, vec3, filt0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16i8 res0, res1, res2, res3;
+ v8u16 vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+ ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask, out0, out1;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ loop_cnt = (height >> 2) - 1;
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LD_SB(src);
+ src6 = LD_SB(src + 16);
+ src7 = LD_SB(src + 24);
+ src5 = __msa_sldi_b(src6, src4, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ PCKEV_ST_SB(out6, out7, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src4 = LD_SB(src + 32);
+ src6 = LD_SB(src + 48);
+ src7 = LD_SB(src + 56);
+ SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ PCKEV_ST_SB(out4, out5, dst + 32);
+ PCKEV_ST_SB(out6, out7, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_hor[8];
+
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
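+  /* Same dispatch rule as the other MSA wrappers: bilinear filters (taps 0
+   * and 1 zero) go to the 2-tap horizontal kernels, full filters to 8-tap. */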
+ if (((const int32_t *)filter_x)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
new file mode 100644
index 000000000..a4d594931
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
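+/* Shuffle control vectors for the VSHF_B2_* macros: consecutive index pairs
+ * (0,1), (1,2), ... gather horizontally adjacent pixels so each dot-product
+ * lane applies two filter taps at once; indices of 16 and above reach into
+ * the other source vector, letting two 4-pixel rows share one register. */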
+const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
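+/* Combined 8-tap horizontal + 8-tap vertical filtering for 4-wide blocks:
+ * horizontal results for the first seven rows are computed up front and
+ * re-used as the vertical filter slides down four output rows per loop. */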
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+ out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
+ SAT_SH2_SH(tmp0, tmp1, 7);
+ out = PCKEV_XORI128_UB(tmp0, tmp1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out5 = hz_out9;
+ out0 = out2;
+ out1 = out3;
+ out2 = out4;
+ }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+ ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src7, src8, src9, src10);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(vec0, vec1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out6 = hz_out10;
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
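+  /* Filter the 16-wide block as two 8-column strips. */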
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16i8 res0, res1, res2, res3;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+ hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+ hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+ hz_out3, hz_out5, 8);
+ hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
+ vec5, vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0;
+ v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
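+  /* Each iteration produces 8 output rows: 2-tap horizontal filtering of the
+     new rows followed by 2-tap vertical filtering of adjacent results. */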
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
+ PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
+ PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else {
+ common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int32_t x_step_q4, const int16_t *filter_y,
+ int32_t y_step_q4, int32_t w, int32_t h) {
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
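+  /* A filter whose first two taps are zero is treated as a 2-tap (bilinear)
+     filter using taps 3 and 4; mixing a 2-tap filter with an 8-tap one falls
+     back to the C implementation. */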
+ if (((const int32_t *)filter_x)[0] == 0 &&
+ ((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ default:
+ aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else if (((const int32_t *)filter_x)[0] == 0 ||
+ ((const int32_t *)filter_y)[0] == 0) {
+ aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ default:
+ aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
new file mode 100644
index 000000000..f7bdfc2bd
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
@@ -0,0 +1,699 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+ v16i8 src10998, filt0, filt1, filt2, filt3;
+ v16u8 out;
+ v8i16 filt, out10, out32;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+ src4332, src6554);
+ XORI_B3_128_SB(src2110, src4332, src6554);
+
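+  /* Each iteration produces 4 output rows: interleave the 4 new rows with
+     their predecessors, apply the vertical 8-tap filter, then slide the
+     packed row-pair vectors forward for the next iteration. */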
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ XORI_B2_128_SB(src8776, src10998);
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+ filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+ filt1, filt2, filt3);
+ SRARI_H2_SH(out10, out32, FILTER_BITS);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ v16u8 tmp0, tmp1;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+ tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+ tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+ out3_r, tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 64);
+}
+
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4;
+ v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+ v16u8 filt0;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ src8 = LD_SB(src);
+ src += src_stride;
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src5 = LD_UB(src + 16);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+ LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+ src += (4 * src_stride);
+
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+ ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+ ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+ ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB4(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_UB2(src, src_stride, src1, src2);
+ LD_UB2(src + 16, src_stride, src4, src5);
+ LD_UB2(src + 32, src_stride, src7, src8);
+ LD_UB2(src + 48, src_stride, src10, src11);
+ src += (2 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+ ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+ ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+ ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+ ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
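+  /* First two taps zero => effectively a 2-tap (bilinear) vertical filter;
+     otherwise take the full 8-tap path. */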
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
new file mode 100644
index 000000000..75f8c7ea8
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/macros_msa.h"
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint32_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ if (0 == (height % 4)) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ out2 = __msa_copy_u_w((v4i32)dst2, 0);
+ out3 = __msa_copy_u_w((v4i32)dst3, 0);
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == (height % 2)) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ LD_UB2(dst, dst_stride, dst0, dst1);
+
+ AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ SW(out0, dst);
+ dst += dst_stride;
+ SW(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+
+ out0 = __msa_copy_u_d((v2i64)dst0, 0);
+ out1 = __msa_copy_u_d((v2i64)dst1, 0);
+ out2 = __msa_copy_u_d((v2i64)dst2, 0);
+ out3 = __msa_copy_u_d((v2i64)dst3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ for (cnt = (height / 8); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+ dst += (8 * dst_stride);
+ }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 8); cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+ LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+ dst_dup += (4 * dst_stride);
+ LD_UB4(src, src_stride, src8, src10, src12, src14);
+ LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+ src += (4 * src_stride);
+ LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+ LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+ dst_dup += (4 * dst_stride);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+ dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+ dst13, dst14, dst15);
+
+ ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+ ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+ ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+ LD_UB4(src, 16, src8, src9, src10, src11);
+ src += src_stride;
+ LD_UB4(src, 16, src12, src13, src14, src15);
+ src += src_stride;
+
+ LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
+ dst_dup += dst_stride;
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+ dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+ dst13, dst14, dst15);
+
+ ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ avg_width4_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 8: {
+ avg_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ avg_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ int32_t lp, cnt;
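+      /* Scalar rounding average for widths without a dedicated MSA path. */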
+ for (cnt = h; cnt--;) {
+ for (lp = 0; lp < w; ++lp) {
+ dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
new file mode 100644
index 000000000..f7f116f4d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/mips/macros_msa.h"
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
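+  /* Copy the block as 16-column strips, 8 rows per inner iteration. */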
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ src_tmp += (8 * src_stride);
+
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
+ dst_stride);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+ dst += (8 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ uint32_t cnt, tmp;
+      /* width 4: copy one 32-bit word per row */
+ for (cnt = h; cnt--;) {
+ tmp = LW(src);
+ SW(tmp, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ copy_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ copy_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ uint32_t cnt;
+ for (cnt = h; cnt--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
new file mode 100644
index 000000000..1a0ae4d8d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
+#define AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
+
+#include "aom_dsp/mips/macros_msa.h"
+#include "aom_dsp/aom_filter.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
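+/* Apply the four filter-tap pairs to four interleaved source vectors using
+   signed byte dot products, accumulating the partial sums with a saturating
+   add. */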
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
+ filt3) \
+ ({ \
+ v8i16 tmp_dpadd_0, tmp_dpadd_1; \
+ \
+ tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
+ tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
+ tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
+ tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
+ tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \
+ \
+ tmp_dpadd_0; \
+ })
+
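+/* Horizontally filter one row: gather source bytes with the four shuffle
+   masks, run the 8-tap dot-product accumulation, then round and saturate the
+   result to the signed 8-bit range. */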
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \
+ filt_h1, filt_h2, filt_h3) \
+ ({ \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8i16 hz_out_m; \
+ \
+ VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
+ vec3_m); \
+ hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \
+ filt_h1, filt_h2, filt_h3); \
+ \
+ hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
+ hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
+ \
+ hz_out_m; \
+ })
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
+ DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
+ DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
+ DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
+ ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
+ }
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
+ res4_m, res5_m, res6_m, res7_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
+ res4_m, res5_m, res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+ }
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
+ { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = PCKEV_XORI128_UB(in1, in0); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+ }
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \
+ { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+ }
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
+ stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
+ }
+#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */
diff --git a/third_party/aom/aom_dsp/mips/avg_msa.c b/third_party/aom/aom_dsp/mips/avg_msa.c
new file mode 100644
index 000000000..0e1728155
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/avg_msa.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
+uint32_t aom_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+ uint32_t sum_out;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+ v4u32 sum = { 0 };
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
+ HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
+ ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
+ ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
+ sum0 += sum4;
+
+ sum = __msa_hadd_u_w(sum0, sum0);
+ sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
+ sum = __msa_hadd_u_w(sum0, sum0);
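+  /* rounding shift right by 6 (divide by 64) to average the 8x8 block */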
+ sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
+ sum_out = __msa_copy_u_w((v4i32)sum, 0);
+
+ return sum_out;
+}
+
+uint32_t aom_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+ uint32_t sum_out;
+ uint32_t src0, src1, src2, src3;
+ v16u8 vec = { 0 };
+ v8u16 sum0;
+ v4u32 sum1;
+ v2u64 sum2;
+
+ LW4(src, src_stride, src0, src1, src2, src3);
+ INSERT_W4_UB(src0, src1, src2, src3, vec);
+
+ sum0 = __msa_hadd_u_h(vec, vec);
+ sum1 = __msa_hadd_u_w(sum0, sum0);
+ sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
+ sum1 = __msa_hadd_u_w(sum0, sum0);
+ sum2 = __msa_hadd_u_d(sum1, sum1);
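+  /* rounding shift right by 4 (divide by 16) to average the 4x4 block */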
+ sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
+ sum_out = __msa_copy_u_w((v4i32)sum1, 0);
+
+ return sum_out;
+}
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c
new file mode 100644
index 000000000..00ab75dc3
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *aom_ff_cropTbl;
+
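+/* Build the clamping table: entries below 0 map to 0, 0..255 map to
+ * themselves, and entries above 255 map to 255. The DSPr2 convolve code
+ * clamps filter output with a single lbux lookup through aom_ff_cropTbl. */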
+void aom_dsputil_static_init(void) {
+ int i;
+
+ for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++) {
+ aom_ff_cropTbl_a[i] = 0;
+ aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+ }
+
+ aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
new file mode 100644
index 000000000..31159fdcd
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_MIPS_DSPR2_H_
+#define AOM_COMMON_MIPS_DSPR2_H_
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+extern uint8_t *aom_ff_cropTbl;  // Defined in "aom_dsp/mips/common_dspr2.c"
+
+static INLINE void prefetch_load(const unsigned char *src) {
+ __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+ __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+ __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+ __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_MIPS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
new file mode 100644
index 000000000..d557115b9
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
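+  /* pack filter taps 3 and 4 into one word; dpa.w.ph then applies both taps
+   * with a single multiply-accumulate */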
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ uint32_t pos = 38;
+
+ assert(y_step_q4 == 16);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ w, h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ h);
+ break;
+ default:
+ aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
new file mode 100644
index 000000000..efbdcf60f
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3;
+ uint32_t tn1, tn2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tp4], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tp4], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tp3], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp4], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
+
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
+
+ /* store bytes */
+ "sb %[tp3], 3(%[dst]) \n\t"
+ "sb %[tp4], 5(%[dst]) \n\t"
+ "sb %[tp1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
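+/* Filters `count` 16-pixel blocks per row: the w == 16 caller passes
+ * count == 1 and the w == 32 caller passes count == 2 (see the switch at the
+ * bottom of this file). */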
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ uint32_t pos = 38;
+
+ assert(x_step_q4 == 16);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 8:
+ convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 16:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 1);
+ break;
+ case 32:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ default:
+ aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
new file mode 100644
index 000000000..066308315
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
@@ -0,0 +1,1030 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
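+/* The "transposed" horizontal filters below store their output rotated: each
+ * filtered source row is written as a column of dst (dst advances by one byte
+ * per source row). */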
+static void convolve_bi_horiz_4_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_stride] "r"(dst_stride));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[Temp1], %[p3](%[cm]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[Temp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
+ [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
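For reference, the per-pixel arithmetic in the scalar fallback above boils down to the small sketch below (FILTER_BITS is 7 in this codebase, so the two bilinear taps sum to 128; the clip_u8 helper and the sample values are illustrative, not library code).

#include <stdint.h>
#include <stdio.h>

/* Illustrative 8-bit clamp; the library uses clip_pixel()/aom_ff_cropTbl. */
static uint8_t clip_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

int main(void) {
  const int filter3 = 64, filter4 = 64;         /* bilinear taps at the half-pel point */
  const int src_x = 100, src_x1 = 102;          /* two neighbouring source pixels */
  int sum = src_x * filter3 + src_x1 * filter4;
  /* ROUND_POWER_OF_TWO(sum, FILTER_BITS) with FILTER_BITS == 7 */
  int out = clip_u8((sum + 64) >> 7);
  printf("%d\n", out); /* prints 101, the rounded average of 100 and 102 */
  return 0;
}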
+
+void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h) {
+ uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h, (w / 16));
+ break;
+ case 64:
+ prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
+ h);
+ break;
+ }
+}
+#endif
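All of these DSPr2 kernels write 38 into the DSPControl pos field before using "extp ..., 31". Read together, and treating the exact extract semantics as an assumption rather than a spec quote, that pulls the 32-bit field ending at bit 7 out of the 64-bit accumulator, which amounts to shifting the filtered sum right by FILTER_BITS (7). A plain-C model:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int pos = 38, size = 31;             /* wrdsp pos value; extp size operand */
  int64_t acc = 64 + 100 * 64 + 102 * 64;    /* rounding bias + 2-tap sum, as in the asm */
  /* extp with pos = 38, size = 31 is modelled here as bits 38..7 of the accumulator. */
  uint32_t extracted = (uint32_t)((acc >> (pos - size)) & 0xffffffff);
  printf("%u\n", extracted);                 /* 101, before the lbux clamp */
  return 0;
}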
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
new file mode 100644
index 000000000..dc51ab1cb
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[p1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[p2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
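The "filter45 = ((const int32_t *)filter)[0];" idiom above packs filter_x0[3] and filter_x0[4] into one 32-bit word so that each dpa.w.ph applies both taps to a packed pixel pair. A portable sketch of the same idea, assuming the little-endian layout these DSPr2 builds target (names and values are illustrative, not library code):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void) {
  int16_t filter[8] = { 0, 0, 0, 64, 64, 0, 0, 0 }; /* 2-tap (bilinear) kernel */
  uint32_t filter45;
  memcpy(&filter45, &filter[3], sizeof(filter45));  /* same bytes the cast reads */

  int16_t tap3 = (int16_t)(filter45 & 0xffff);      /* filter[3] on little-endian */
  int16_t tap4 = (int16_t)(filter45 >> 16);         /* filter[4] on little-endian */

  /* What one dpa.w.ph contributes for a packed pixel pair p0, p1, with the
     rounding bias 64 preloaded into the accumulator via mtlo: */
  int p0 = 100, p1 = 102;
  int acc = 64 + p0 * tap3 + p1 * tap4;
  printf("%d\n", acc >> 7); /* extp performs this >>7; lbux then clamps via the crop table */
  return 0;
}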
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[p1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ uint32_t pos = 38;
+
+ assert(x_step_q4 == 16);
+
+ prefetch_load((const uint8_t *)filter_x);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
new file mode 100644
index 000000000..3367be01a
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
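The vertical kernel above loads the same 4-byte column from two consecutive rows and regroups the bytes with preceu/precrq/append into (row, row+1) pairs before dpa.w.ph. Per output pixel the arithmetic it performs is roughly the following sketch (illustrative values, not library code):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* The same four x positions taken from two consecutive rows. */
  uint8_t row0[4] = { 10, 20, 30, 40 };
  uint8_t row1[4] = { 11, 21, 31, 41 };
  const int tap3 = 64, tap4 = 64; /* the pair packed into filter45 */

  for (int x = 0; x < 4; x++) {
    /* One (row0[x], row1[x]) pair per dpa.w.ph, bias 64 preloaded via mtlo. */
    int acc = 64 + row0[x] * tap3 + row1[x] * tap4;
    printf("%d ", acc >> 7); /* extp shift; lbux clamps the result afterwards */
  }
  printf("\n"); /* prints: 11 21 31 41 */
  return 0;
}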
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ uint32_t pos = 38;
+
+ assert(y_step_q4 == 16);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
+ h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
new file mode 100644
index 000000000..298065adb
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
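What sets the _avg kernels apart from the plain vertical ones is the addqh_r.w step: the filtered, clamped value is blended with the byte already at dst using a rounded halving add before the store. A minimal model of that final blend (illustrative values, not library code):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* filtered = output of the 8-tap vertical filter after the crop-table clamp,
     existing = the byte already at dst_ptr (loaded with lbu above). */
  uint8_t filtered = 97, existing = 100;
  uint8_t blended = (uint8_t)((filtered + existing + 1) >> 1); /* addqh_r.w as a rounded halving add */
  printf("%u\n", blended); /* prints 99 */
  return 0;
}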
+
+static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ if (((const int32_t *)filter_y)[0] == 0) {
+ aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
+ h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ h);
+ break;
+ default:
+ aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
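The dispatch above spots the 2-tap case by reinterpreting the first two 16-bit taps of filter_y as one 32-bit word, which is zero exactly when both taps are zero. A portable restatement (the example tap sets are illustrative, not the library's coefficient tables):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static int first_two_taps_zero(const int16_t *filter_y) {
  uint32_t first_pair;
  memcpy(&first_pair, filter_y, sizeof(first_pair)); /* taps 0 and 1 */
  return first_pair == 0; /* zero either way round, so endianness does not matter */
}

int main(void) {
  const int16_t bilinear[8] = { 0, 0, 0, 64, 64, 0, 0, 0 };
  const int16_t eight_tap[8] = { -1, 3, -10, 35, 114, -13, 1, -1 }; /* example taps only */
  printf("%d %d\n", first_two_taps_zero(bilinear), first_two_taps_zero(eight_tap)); /* 1 0 */
  return 0;
}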
+
+void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ if (intermediate_height < h) intermediate_height = h;
+
+ aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, intermediate_height);
+
+ aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
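aom_convolve8_avg_dspr2 above is the usual separable two-pass scheme: a horizontal pass into a 64-byte-wide temporary that starts 3 rows above the block, then a vertical averaging pass that reads from temp + 64 * 3. Under the asserted constraints (w, h <= 64, both steps equal to 16) the buffer sizing works out as in this small check (illustrative only):

#include <stdio.h>

int main(void) {
  const int h = 64, y_step_q4 = 16;
  /* Rows the horizontal pass must produce: the block itself plus the 3 rows
     above and 4 below that the 8-tap vertical filter needs. */
  int intermediate_height = ((h * y_step_q4) >> 4) + 7; /* 64 + 7 = 71 */
  printf("intermediate rows: %d, fits the 64 * 135 buffer: %s\n",
         intermediate_height, (64 * intermediate_height <= 64 * 135) ? "yes" : "no");
  return 0;
}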
+
+void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride, int w,
+ int h) {
+ int x, y;
+ uint32_t tp1, tp2, tn1;
+ uint32_t tp3, tp4, tn2;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ /* 1 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+
+ : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 8:
+ /* 2 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 16:
+ /* 4 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 32:
+ /* 8 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_load(src + src_stride + 64);
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 32(%[dst]) \n\t"
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 36(%[src]) \n\t"
+ "ulw %[tp4], 36(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 40(%[src]) \n\t"
+ "ulw %[tp2], 40(%[dst]) \n\t"
+ "sw %[tn1], 32(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 36(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 44(%[src]) \n\t"
+ "ulw %[tp4], 44(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 48(%[src]) \n\t"
+ "ulw %[tp2], 48(%[dst]) \n\t"
+ "sw %[tn1], 40(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 44(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 52(%[src]) \n\t"
+ "ulw %[tp4], 52(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 56(%[src]) \n\t"
+ "ulw %[tp2], 56(%[dst]) \n\t"
+ "sw %[tn1], 48(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 52(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 60(%[src]) \n\t"
+ "ulw %[tp4], 60(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 56(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ default:
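+      /* non-standard widths: plain C rounded average, one byte at a time */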
+ for (y = h; y > 0; --y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = (dst[x] + src[x] + 1) >> 1;
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
new file mode 100644
index 000000000..f6534b420
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -0,0 +1,998 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
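+  /* the eight 16-bit taps are reloaded as four 32-bit words, each holding an
+     adjacent tap pair, so every dpa.w.ph below accumulates two products at
+     once; vector4a (64) seeds the accumulator with the rounding term
+     1 << (FILTER_BITS - 1) */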
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tn3], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tn3], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tn2], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn3], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tn1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tn2], %[tn2], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[p2] \n\t"
+
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tn1], %[tn1], %[n1] \n\t"
+
+ /* store bytes */
+ "sb %[tn2], 3(%[dst]) \n\t"
+ "sb %[tn3], 5(%[dst]) \n\t"
+ "sb %[tn1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
+ [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
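+  /* processes one 16-pixel span per inner-loop pass; the caller picks
+     count = 1 for w == 16 and count = 2 for w == 32 (see the dispatcher at
+     the end of this file) */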
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ if (((const int32_t *)filter_x)[0] == 0) {
+ aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+ src -= 3;
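+    /* the 8-tap kernels read 3 pixels to the left of each output position,
+       so rewind src here; the C fallback below undoes this with src + 3 */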
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 8:
+ convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 16:
+ convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 1);
+ break;
+ case 32:
+ convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ default:
+ aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w,
+ h);
+ break;
+ }
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
new file mode 100644
index 000000000..c871702f4
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
@@ -0,0 +1,1590 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
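+  /* output is written transposed: results for one input row march down a
+     dst column (dst_ptr steps by dst_stride per pixel) while dst itself
+     advances by one byte per row, presumably to build the transposed
+     intermediate used by the two-pass filter later in this file */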
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tn1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
+ [dst_stride] "r"(dst_stride));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4, n1;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
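+    /* even outputs go down the dst_ptr column and odd outputs down odd_dst,
+       both stepping by dst_pitch_2 (two rows), interleaving them back into
+       consecutive rows of the transposed output */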
+
+ __asm__ __volatile__(
+ "ulw %[tp2], 0(%[src]) \n\t"
+ "ulw %[tp1], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp1] \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tp3] \n\t"
+ "preceu.ph.qbl %[n1], %[tp3] \n\t"
+ "ulw %[tp2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "lbux %[tp3], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp3], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "ulw %[tp1], 1(%[src]) \n\t"
+ "ulw %[tp3], 5(%[src]) \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[tp2], %[p3](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "ulw %[tp2], 9(%[src]) \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n1], %[tp2] \n\t"
+ "ulw %[Temp1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[Temp1] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[n1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_horiz_16_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 16(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 17(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
+
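+/* 64-wide variant of the transposed 16-pixel kernel above: a fixed four
+ * chunks per row and an extra prefetch for the wider source rows. */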
+static void convolve_horiz_64_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 16(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 17(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
+
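+/* Generic C fallback for a pass: 8-tap filter of each row, written
+ * transposed (dst[x * dst_stride]), for block sizes without a specialized
+ * DSPR2 kernel. */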
+void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y, k;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
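+/* Transposing copy, used by aom_convolve8_dspr2 when a pass reduces to a
+ * pure copy (unit filter, center tap 0x80). */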
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x * dst_stride] = src[x];
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
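+/* Two-pass 8-tap convolution: the horizontal pass filters src into a
+ * transposed intermediate buffer (stride = intermediate_height), then the
+ * same horizontal kernels are run on that buffer to apply the vertical
+ * filter, transposing the data back into dst. Each pass picks a copy,
+ * 2-tap or 8-tap routine based on the filter coefficients. */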
+void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+ uint32_t pos = 38;
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ if (intermediate_height < h) intermediate_height = h;
+
+  /* first pass: src -> transposed intermediate (copy, 2-tap or 8-tap filter) */
+ if (filter_x[3] == 0x80) {
+ copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
+ intermediate_height, w, intermediate_height);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
+ intermediate_height, filter_x, w, intermediate_height);
+ } else {
+ src -= (src_stride * 3 + 3);
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height, (w / 16));
+ break;
+ case 64:
+ prefetch_load(src + 32);
+ convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
+ break;
+ default:
+ convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
+ filter_x, w, intermediate_height);
+ break;
+ }
+ }
+
+  /* second pass: intermediate -> dst (copy, 2-tap or 8-tap filter) */
+ if (filter_y[3] == 0x80) {
+ copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
+ filter_y, h, w);
+ } else {
+ switch (h) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w, (h / 16));
+ break;
+ case 64:
+ convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
+ break;
+ default:
+ convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
+ filter_y, h, w);
+ break;
+ }
+ }
+}
+
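+/* Width-specialized copy: rows are prefetched one stride ahead and moved with
+ * unaligned word loads (ulw) and word stores (sw); widths other than
+ * 4/8/16/32/64 fall back to a plain byte loop. */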
+void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4: {
+ uint32_t tp1;
+
+ /* 1 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], (%[src]) \n\t"
+ "sw %[tp1], (%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 8: {
+ uint32_t tp1, tp2;
+
+ /* 2 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 16: {
+ uint32_t tp1, tp2, tp3, tp4;
+
+ /* 4 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 32: {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ /* 8 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 64: {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_load(src + src_stride + 64);
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 36(%[src]) \n\t"
+ "ulw %[tp3], 40(%[src]) \n\t"
+ "ulw %[tp4], 44(%[src]) \n\t"
+ "ulw %[tp5], 48(%[src]) \n\t"
+ "ulw %[tp6], 52(%[src]) \n\t"
+ "ulw %[tp7], 56(%[src]) \n\t"
+ "ulw %[tp8], 60(%[src]) \n\t"
+
+ "sw %[tp1], 32(%[dst]) \n\t" /* store */
+ "sw %[tp2], 36(%[dst]) \n\t" /* store */
+ "sw %[tp3], 40(%[dst]) \n\t" /* store */
+ "sw %[tp4], 44(%[dst]) \n\t" /* store */
+ "sw %[tp5], 48(%[dst]) \n\t" /* store */
+ "sw %[tp6], 52(%[dst]) \n\t" /* store */
+ "sw %[tp7], 56(%[dst]) \n\t" /* store */
+ "sw %[tp8], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ default:
+ for (y = h; y--;) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = src[x];
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
new file mode 100644
index 000000000..c60557617
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
@@ -0,0 +1,878 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
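+/* 8-tap horizontal filter for 4-wide blocks. The eight 16-bit taps are read
+ * as four packed 32-bit pairs (vector1b..vector4b) for dpa.w.ph; each
+ * accumulator is seeded with the rounding constant 64, the result is
+ * extracted with extp (using the position programmed by the caller's wrdsp)
+ * and clamped to 8 bits through the crop table lookup (lbux). */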
+static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[tn1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[n2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
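+/* 8-wide version of the same scheme: four even and four odd outputs per row,
+ * with balign producing the byte-shifted input words used for the odd-phase
+ * pixels. */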
+static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[n1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
+ [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
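+/* Filters each row in count 16-pixel chunks: the eight even outputs come from
+ * the word loads at offsets 0, 4, 8, ... and the eight odd outputs from the
+ * unaligned loads at offsets 1, 5, 9, ..., with the three accumulators
+ * rotated so one result is extracted and stored while the next accumulates. */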
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
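+/* 64-wide variant of the 16-pixel kernel: a fixed four chunks per row plus
+ * additional prefetches of the wider source and destination rows. */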
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
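+/* Dispatch for the horizontal 8-tap filter. x_step_q4 must be 16 (no
+   horizontal scaling). When the leading pair of filter taps is zero the
+   2-tap path aom_convolve2_horiz_dspr2 is taken instead. */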
+void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ if (((const int32_t *)filter_x)[0] == 0) {
+ aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+ prefetch_load((const uint8_t *)filter_x);
+ src -= 3;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
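+    /* Width-specialized kernels: w == 16 and w == 32 share the 16-wide
+       kernel, whose last argument is the number of 16-pixel passes per
+       row; unsupported widths fall back to the C version. */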
+ switch (w) {
+ case 4:
+ convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
new file mode 100644
index 000000000..d8a90b6ab
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
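+/* Vertical 8-tap filter over a row of width w, four output pixels per
+   inner iteration. Each pixel accumulates in one of the DSP accumulators
+   $ac0..$ac3, and the eight taps are read from filter_y as four packed
+   16-bit pairs (vector1b..vector4b). */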
+static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ if (((const int32_t *)filter_y)[0] == 0) {
+ aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
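+    /* Widths 4 through 32 share the generic width-w loop; w == 64 adds a
+       second dst prefetch and uses the dedicated 64-wide kernel, and any
+       other width falls back to the C version. */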
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
new file mode 100644
index 000000000..f8fd9e2b6
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_
+#define AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
+
+void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
+
+void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h);
+
+void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
new file mode 100644
index 000000000..dc9c63226
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
@@ -0,0 +1,948 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/fwd_txfm_msa.h"
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+ int32_t src_stride,
+ int16_t *temp_buff) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 step0, step1, step2, step3;
+ v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+ v8i16 step0_1, step1_1, step2_1, step3_1;
+
+ /* 1st and 2nd set */
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+ LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
+ LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+ LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+ SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
+ step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH4(step0, step1, step2, step3, temp_buff, 8);
+ ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
+ ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
+ ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
+
+ /* 3rd and 4th set */
+ LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
+ LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
+ LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+ LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+ SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
+ step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
+ ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
+ ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
+ ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 temp0, temp1;
+
+ /* fdct even */
+ LD_SH4(input, 8, in0, in1, in2, in3);
+ LD_SH4(input + 96, 8, in12, in13, in14, in15);
+ BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
+ vec3, in12, in13, in14, in15);
+ LD_SH4(input + 32, 8, in4, in5, in6, in7);
+ LD_SH4(input + 64, 8, in8, in9, in10, in11);
+ BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
+ in8, in9, in10, in11);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp);
+ ST_SH(temp1, temp + 512);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 256);
+ ST_SH(temp1, temp + 768);
+
+ SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 128);
+ ST_SH(temp1, temp + 896);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 640);
+ ST_SH(temp1, temp + 384);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 64);
+ ST_SH(temp1, temp + 960);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 576);
+ ST_SH(temp1, temp + 448);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 320);
+ ST_SH(temp1, temp + 704);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 192);
+ ST_SH(temp1, temp + 832);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+ in20 = LD_SH(input + 32);
+ in21 = LD_SH(input + 40);
+ in26 = LD_SH(input + 80);
+ in27 = LD_SH(input + 88);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = LD_SH(input + 16);
+ in19 = LD_SH(input + 24);
+ in28 = LD_SH(input + 96);
+ in29 = LD_SH(input + 104);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, input + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, input + 40);
+ vec4 = in29 - in26;
+ ST_SH(vec4, input + 80);
+ vec4 = in28 - in27;
+ ST_SH(vec4, input + 88);
+
+ in21 = in18 + in21;
+ in20 = in19 + in20;
+ in27 = in28 + in27;
+ in26 = in29 + in26;
+
+ LD_SH4(input + 48, 8, in22, in23, in24, in25);
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = LD_SH(input);
+ in17 = LD_SH(input + 8);
+ in30 = LD_SH(input + 112);
+ in31 = LD_SH(input + 120);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, input + 16);
+ vec4 = in16 - in23;
+ ST_SH(vec4, input + 24);
+ vec4 = in31 - in24;
+ ST_SH(vec4, input + 96);
+ vec4 = in30 - in25;
+ ST_SH(vec4, input + 104);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr);
+ ST_SH(vec4, temp_ptr + 960);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 448);
+ ST_SH(vec4, temp_ptr + 512);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec4, temp_ptr + 704);
+ ST_SH(vec5, temp_ptr + 256);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec4, temp_ptr + 192);
+ ST_SH(vec5, temp_ptr + 768);
+
+ LD_SH4(input + 16, 8, in22, in23, in20, in21);
+ LD_SH4(input + 80, 8, in26, in27, in24, in25);
+ in16 = in20;
+ in17 = in21;
+ DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ ADD2(in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 832);
+ ST_SH(vec4, temp_ptr + 128);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 320);
+ ST_SH(vec4, temp_ptr + 640);
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 576);
+ ST_SH(vec4, temp_ptr + 384);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 64);
+ ST_SH(vec4, temp_ptr + 896);
+}
+
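+/* One 8-column slice of the 32-point column transform: butterfly the input
+   rows into tmp_buf, then store the even results and the odd results
+   (tmp_buf + 128) into the big buffer, odd outputs offset by 32 elements. */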
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+ int16_t *tmp_buf, int16_t *tmp_buf_big) {
+ fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+ fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+ fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+ int16_t *output) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
+
+ LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
+ step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
+
+ /* 2nd set */
+ LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
+ step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
+ (output + 8 * 8), 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
+}
+
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+ v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+ v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
+
+ /* Stage 3 */
+ UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+ UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+ UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+ UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+ UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+ UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+ UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+ UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+ ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
+ tmp1_w, tmp2_w, tmp3_w);
+ BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
+ ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
+ vec1_r, vec2_r, vec3_r);
+
+ tmp3_w = vec0_r + vec3_r;
+ vec0_r = vec0_r - vec3_r;
+ vec3_r = vec1_r + vec2_r;
+ vec1_r = vec1_r - vec2_r;
+
+ DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ ST_SH2(vec5, vec4, out, 8);
+
+ DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ ST_SH2(vec5, vec4, out + 16, 8);
+
+ LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 32);
+ ST_SH(in5, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 40);
+ ST_SH(in5, out + 48);
+
+ LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 64);
+ ST_SH(in5, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 72);
+ ST_SH(in5, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 80);
+ ST_SH(in5, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 96);
+ ST_SH(in5, out + 88);
+}
+
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out);
+ ST_SH(temp1, out + 8);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 16);
+ ST_SH(temp1, out + 24);
+
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 32);
+ ST_SH(temp1, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 40);
+ ST_SH(temp1, out + 48);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 64);
+ ST_SH(temp1, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 72);
+ ST_SH(temp1, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 80);
+ ST_SH(temp1, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 96);
+ ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+ in20 = LD_SH(temp + 32);
+ in21 = LD_SH(temp + 40);
+ in26 = LD_SH(temp + 80);
+ in27 = LD_SH(temp + 88);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = LD_SH(temp + 16);
+ in19 = LD_SH(temp + 24);
+ in28 = LD_SH(temp + 96);
+ in29 = LD_SH(temp + 104);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, interm_ptr + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, interm_ptr + 88);
+ vec4 = in28 - in27;
+ ST_SH(vec4, interm_ptr + 56);
+ vec4 = in29 - in26;
+ ST_SH(vec4, interm_ptr + 64);
+
+ ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+ in22 = LD_SH(temp + 48);
+ in23 = LD_SH(temp + 56);
+ in24 = LD_SH(temp + 64);
+ in25 = LD_SH(temp + 72);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = LD_SH(temp);
+ in17 = LD_SH(temp + 8);
+ in30 = LD_SH(temp + 112);
+ in31 = LD_SH(temp + 120);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, interm_ptr + 40);
+ vec4 = in30 - in25;
+ ST_SH(vec4, interm_ptr + 48);
+ vec4 = in31 - in24;
+ ST_SH(vec4, interm_ptr + 72);
+ vec4 = in16 - in23;
+ ST_SH(vec4, interm_ptr + 80);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out);
+ ST_SH(vec4, out + 120);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 112);
+ ST_SH(vec4, out + 8);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 16);
+ ST_SH(vec5, out + 104);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 24);
+ ST_SH(vec5, out + 96);
+
+ in20 = LD_SH(interm_ptr + 32);
+ in21 = LD_SH(interm_ptr + 88);
+ in27 = LD_SH(interm_ptr + 56);
+ in26 = LD_SH(interm_ptr + 64);
+
+ in16 = in20;
+ in17 = in21;
+ DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = LD_SH(interm_ptr + 40);
+ in25 = LD_SH(interm_ptr + 48);
+ in24 = LD_SH(interm_ptr + 72);
+ in23 = LD_SH(interm_ptr + 80);
+
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ ADD2(in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 32);
+ ST_SH(vec4, out + 88);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 40);
+ ST_SH(vec4, out + 80);
+
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 72);
+ ST_SH(vec4, out + 48);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 56);
+ ST_SH(vec5, out + 64);
+}
+
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+ /* 1st set */
+ in0 = LD_SH(temp);
+ in4 = LD_SH(temp + 32);
+ in2 = LD_SH(temp + 64);
+ in6 = LD_SH(temp + 96);
+ in1 = LD_SH(temp + 128);
+ in7 = LD_SH(temp + 152);
+ in3 = LD_SH(temp + 192);
+ in5 = LD_SH(temp + 216);
+
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ /* 2nd set */
+ in0_1 = LD_SH(temp + 16);
+ in1_1 = LD_SH(temp + 232);
+ in2_1 = LD_SH(temp + 80);
+ in3_1 = LD_SH(temp + 168);
+ in4_1 = LD_SH(temp + 48);
+ in5_1 = LD_SH(temp + 176);
+ in6_1 = LD_SH(temp + 112);
+ in7_1 = LD_SH(temp + 240);
+
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
+ TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+ /* 3rd set */
+ in0 = LD_SH(temp + 8);
+ in1 = LD_SH(temp + 136);
+ in2 = LD_SH(temp + 72);
+ in3 = LD_SH(temp + 200);
+ in4 = LD_SH(temp + 40);
+ in5 = LD_SH(temp + 208);
+ in6 = LD_SH(temp + 104);
+ in7 = LD_SH(temp + 144);
+
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
+ 32);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
+
+ /* 4th set */
+ in0_1 = LD_SH(temp + 24);
+ in1_1 = LD_SH(temp + 224);
+ in2_1 = LD_SH(temp + 88);
+ in3_1 = LD_SH(temp + 160);
+ in4_1 = LD_SH(temp + 56);
+ in5_1 = LD_SH(temp + 184);
+ in6_1 = LD_SH(temp + 120);
+ in7_1 = LD_SH(temp + 248);
+
+ TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
+ 32);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+ fdct8x32_1d_row_even(temp_buf, temp_buf);
+ fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+ fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
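+/* 32x32 forward transform: four 8-column passes fill tmp_buf_big, followed
+   by four 8-row passes. The first row pass uses the *_4x helper, which
+   widens part of the even stage to 32-bit intermediates; the remaining
+   rows use the 16-bit row path. */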
+void aom_fdct32x32_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+ tmp_buf_big + (8 * i));
+ }
+
+ /* row transform */
+ fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+ /* row transform */
+ for (i = 1; i < 4; ++i) {
+ fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+ }
+}
+
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
+ FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+ FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+ FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+ FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+
+ temp0 = in0 + in3;
+ in0 = in0 - in3;
+ in3 = in1 + in2;
+ in1 = in1 - in2;
+
+ DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+ ST_SH(temp0, out);
+ ST_SH(temp1, out + 8);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ ST_SH(temp0, out + 16);
+ ST_SH(temp1, out + 24);
+
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ ST_SH(temp0, out + 32);
+ ST_SH(temp1, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ ST_SH(temp0, out + 40);
+ ST_SH(temp1, out + 48);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ ST_SH(temp0, out + 64);
+ ST_SH(temp1, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ ST_SH(temp0, out + 72);
+ ST_SH(temp1, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ ST_SH(temp0, out + 80);
+ ST_SH(temp1, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ ST_SH(temp0, out + 96);
+ ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
+ v8i16 vec4, vec5;
+
+ in20 = LD_SH(temp + 32);
+ in21 = LD_SH(temp + 40);
+ in26 = LD_SH(temp + 80);
+ in27 = LD_SH(temp + 88);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ FDCT_POSTPROC_2V_NEG_H(in20, in21);
+ FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+ in18 = LD_SH(temp + 16);
+ in19 = LD_SH(temp + 24);
+ in28 = LD_SH(temp + 96);
+ in29 = LD_SH(temp + 104);
+
+ FDCT_POSTPROC_2V_NEG_H(in18, in19);
+ FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, interm_ptr + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, interm_ptr + 88);
+ vec4 = in29 - in26;
+ ST_SH(vec4, interm_ptr + 64);
+ vec4 = in28 - in27;
+ ST_SH(vec4, interm_ptr + 56);
+
+ ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+ in22 = LD_SH(temp + 48);
+ in23 = LD_SH(temp + 56);
+ in24 = LD_SH(temp + 64);
+ in25 = LD_SH(temp + 72);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+ FDCT_POSTPROC_2V_NEG_H(in22, in23);
+ FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+ in16 = LD_SH(temp);
+ in17 = LD_SH(temp + 8);
+ in30 = LD_SH(temp + 112);
+ in31 = LD_SH(temp + 120);
+
+ FDCT_POSTPROC_2V_NEG_H(in16, in17);
+ FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, interm_ptr + 40);
+ vec4 = in30 - in25;
+ ST_SH(vec4, interm_ptr + 48);
+ vec4 = in31 - in24;
+ ST_SH(vec4, interm_ptr + 72);
+ vec4 = in16 - in23;
+ ST_SH(vec4, interm_ptr + 80);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ ST_SH(vec5, out);
+ ST_SH(vec4, out + 120);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ ST_SH(vec5, out + 112);
+ ST_SH(vec4, out + 8);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ ST_SH(vec4, out + 16);
+ ST_SH(vec5, out + 104);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ ST_SH(vec4, out + 24);
+ ST_SH(vec5, out + 96);
+
+ in20 = LD_SH(interm_ptr + 32);
+ in21 = LD_SH(interm_ptr + 88);
+ in27 = LD_SH(interm_ptr + 56);
+ in26 = LD_SH(interm_ptr + 64);
+
+ in16 = in20;
+ in17 = in21;
+ DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = LD_SH(interm_ptr + 40);
+ in25 = LD_SH(interm_ptr + 48);
+ in24 = LD_SH(interm_ptr + 72);
+ in23 = LD_SH(interm_ptr + 80);
+
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ in16 = in28 + in29;
+ in19 = in31 + in30;
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ ST_SH(vec5, out + 32);
+ ST_SH(vec4, out + 88);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ ST_SH(vec5, out + 40);
+ ST_SH(vec4, out + 80);
+
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ ST_SH(vec5, out + 72);
+ ST_SH(vec4, out + 48);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ ST_SH(vec4, out + 56);
+ ST_SH(vec5, out + 64);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+ fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
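+/* The _rd variant reuses the column pass but routes rows through the *_rd
+   helpers, which apply FDCT_POSTPROC_2V_NEG_H to the butterflied inputs up
+   front instead of after each DOTP_CONST_PAIR stage. */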
+void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+ &tmp_buf_big[0] + (8 * i));
+ }
+
+ /* row transform */
+ for (i = 0; i < 4; ++i) {
+ fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+ out + (8 * i * 32));
+ }
+}
+
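+/* DC-only 32x32 transform: sums every input sample with LD_HADD over 8x8
+   tiles and writes a single coefficient, out[0] = sum >> 3. */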
+void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ int sum = LD_HADD(input, stride);
+ sum += LD_HADD(input + 8, stride);
+ sum += LD_HADD(input + 16, stride);
+ sum += LD_HADD(input + 24, stride);
+ sum += LD_HADD(input + 32 * 8, stride);
+ sum += LD_HADD(input + 32 * 8 + 8, stride);
+ sum += LD_HADD(input + 32 * 8 + 16, stride);
+ sum += LD_HADD(input + 32 * 8 + 24, stride);
+ sum += LD_HADD(input + 32 * 16, stride);
+ sum += LD_HADD(input + 32 * 16 + 8, stride);
+ sum += LD_HADD(input + 32 * 16 + 16, stride);
+ sum += LD_HADD(input + 32 * 16 + 24, stride);
+ sum += LD_HADD(input + 32 * 24, stride);
+ sum += LD_HADD(input + 32 * 24 + 8, stride);
+ sum += LD_HADD(input + 32 * 24 + 16, stride);
+ sum += LD_HADD(input + 32 * 24 + 24, stride);
+ out[0] = (int16_t)(sum >> 3);
+}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
new file mode 100644
index 000000000..f16d290c8
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/fwd_txfm_msa.h"
+
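+/* Column pass of the 16-point FDCT over 8 columns: inputs are pre-scaled
+   by 4 (SLLI_4V by 2), the even half goes through FDCT8x16_EVEN, and the
+   odd half is built with DOT_SHIFT_RIGHT_PCK_H against the packed cospi
+   tables coeff, coeff1 and coeff2. */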
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride) {
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+ v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+ v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
+ v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
+ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
+ v8i16 coeff2 = {
+ -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
+ };
+
+ LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in8, in9, in10, in11, 2);
+ SLLI_4V(in12, in13, in14, in15, 2);
+ ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
+ ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
+ SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
+ SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
+
+ tmp_ptr += 16;
+
+ /* stp 1 */
+ ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
+ ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
+
+ cnst4 = __msa_splati_h(coeff, 0);
+ stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
+
+ cnst5 = __msa_splati_h(coeff, 1);
+ cnst5 = __msa_ilvev_h(cnst5, cnst4);
+ stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
+ stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
+ stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
+
+ /* stp2 */
+ BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+ BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+ ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
+ ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
+ SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
+
+ cnst0 = __msa_splati_h(coeff, 4);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+ stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
+
+ BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+ ILVRL_H2_SH(in15, in8, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr);
+
+ cnst0 = __msa_splati_h(coeff2, 0);
+ cnst0 = __msa_ilvev_h(cnst1, cnst0);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 224);
+
+ ILVRL_H2_SH(in14, in9, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+ ST_SH(in8, tmp_ptr + 128);
+
+ cnst1 = __msa_splati_h(coeff2, 2);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 96);
+
+ SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+ cnst1 = __msa_splati_h(coeff, 3);
+ cnst1 = __msa_ilvev_h(cnst0, cnst1);
+ stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+ /* stp4 */
+ ADD2(stp34, stp25, stp33, stp22, in13, in10);
+
+ ILVRL_H2_SH(in13, in10, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 64);
+
+ cnst0 = __msa_splati_h(coeff2, 1);
+ cnst0 = __msa_ilvev_h(cnst1, cnst0);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 160);
+
+ SUB2(stp34, stp25, stp33, stp22, in12, in11);
+ ILVRL_H2_SH(in12, in11, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+ ST_SH(in8, tmp_ptr + 192);
+
+ cnst1 = __msa_splati_h(coeff2, 3);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 32);
+}
+
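+/* Row pass: transposes the 8x16 tiles, rounds the intermediate values by
+ * (x + 1) >> 2, runs the 1-D 16-point DCT and transposes the results back
+ * before storing them. */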
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+
+ LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+ ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
+ ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
+ SRA_4V(in0, in1, in2, in3, 2);
+ SRA_4V(in4, in5, in6, in7, 2);
+ SRA_4V(in8, in9, in10, in11, 2);
+ SRA_4V(in12, in13, in14, in15, 2);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
+ tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
+}
+
+void aom_fdct4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ /* fdct4 pre-process */
+ {
+ v8i16 vec, mask;
+ v16i8 zero = { 0 };
+ v16i8 one = __msa_ldi_b(1);
+
+ mask = (v8i16)__msa_sldi_b(zero, one, 15);
+ SLLI_4V(in0, in1, in2, in3, 4);
+ vec = __msa_ceqi_h(in0, 0);
+ vec = vec ^ 255;
+ vec = mask & vec;
+ in0 += vec;
+ }
+
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ SRA_4V(in0, in1, in2, in3, 2);
+ PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+ ST_SH2(in0, in2, output, 8);
+}
+
+void aom_fdct8x8_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
+
+void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ out[0] = LD_HADD(input, stride);
+ out[1] = 0;
+}
+
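+/* Full 16x16 forward DCT: column pass over the two 8-column halves into a
+ * temporary buffer, then row pass over the two 8-row halves into 'output'. */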
+void aom_fdct16x16_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+ }
+}
+
+void aom_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ int sum = LD_HADD(input, stride);
+ sum += LD_HADD(input + 8, stride);
+ sum += LD_HADD(input + 16 * 8, stride);
+ sum += LD_HADD(input + 16 * 8 + 8, stride);
+ out[0] = (int16_t)(sum >> 1);
+}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
new file mode 100644
index 000000000..ada25dffd
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_
+#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_
+
+#include "aom_dsp/mips/txfm_macros_msa.h"
+#include "aom_dsp/txfm_common.h"
+
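+/* Sums an 8x8 block of int16 samples (8 rows of 8 at 'stride') into a single
+ * 32-bit value. Scalar equivalent:
+ *   for (r = 0; r < 8; ++r)
+ *     for (c = 0; c < 8; ++c) sum += psrc[r * stride + c]; */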
+#define LD_HADD(psrc, stride) \
+ ({ \
+ v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
+ v4i32 vec_w_m; \
+ \
+ LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
+ ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
+ LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
+ ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
+ in0_m, in4_m); \
+ in0_m += in4_m; \
+ \
+ vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
+ HADD_SW_S32(vec_w_m); \
+ })
+
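+/* 4-point forward DCT across four v8i16 vectors (each lane is an independent
+ * transform); dot products are rounded by DCT_CONST_BITS. */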
+#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 coeff_m = { \
+ cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
+ }; \
+ \
+ BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \
+ vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \
+ cnst2_m = __msa_splati_h(coeff_m, 2); \
+ cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \
+ vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \
+ vec7_m, out0, out2, out1, out3); \
+ }
+
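+/* Halves each element, rounding toward zero: the sign bit (logical >> 15) is
+ * added in via a signed averaging shift. */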
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+ { \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \
+ SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \
+ AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
+ in2, in3); \
+ AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
+ in6, in7); \
+ }
+
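+/* 8-point forward DCT (four butterfly stages) over eight v8i16 inputs;
+ * outputs out0..out7 are in natural frequency order. */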
+#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+ v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
+ \
+ /* FDCT stage1 */ \
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
+ s3_m, s4_m, s5_m, s6_m, s7_m); \
+ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x1_m, x0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
+ x2_m = -x2_m; \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ x2_m = __msa_splati_h(coeff_m, 2); \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ /* stage2 */ \
+ ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
+ \
+ s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ /* stage3 */ \
+ BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x0_m, x1_m); \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ \
+ x1_m = __msa_splati_h(coeff_m, 5); \
+ x0_m = -x0_m; \
+ x0_m = __msa_ilvev_h(x1_m, x0_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
+ \
+ x2_m = __msa_splati_h(coeff_m, 6); \
+ x3_m = -x3_m; \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ }
+
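+/* Even half of the 16-point forward DCT: an 8-point DCT of the stage-1 sums
+ * (the same butterfly network as AOM_FDCT8). */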
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v8i16 x0_m, x1_m, x2_m, x3_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
+ \
+ /* FDCT stage1 */ \
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
+ s3_m, s4_m, s5_m, s6_m, s7_m); \
+ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x1_m, x0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
+ x2_m = -x2_m; \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ x2_m = __msa_splati_h(coeff_m, 2); \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ /* stage2 */ \
+ ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
+ \
+ s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ /* stage3 */ \
+ BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x0_m, x1_m); \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ \
+ x1_m = __msa_splati_h(coeff_m, 5); \
+ x0_m = -x0_m; \
+ x0_m = __msa_ilvev_h(x1_m, x0_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
+ \
+ x2_m = __msa_splati_h(coeff_m, 6); \
+ x3_m = -x3_m; \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ }
+
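+/* Odd half of the 16-point forward DCT: produces outputs 1, 3, 5, ..., 15
+ * from the stage-1 differences. */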
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
+ input7, out1, out3, out5, out7, out9, out11, out13, \
+ out15) \
+ { \
+ v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
+ v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
+ v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \
+ v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
+ v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \
+ v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \
+ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \
+ v8i16 coeff2_m = { \
+ -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \
+ }; \
+ \
+ /* stp 1 */ \
+ ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \
+ ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \
+ \
+ cnst4_m = __msa_splati_h(coeff_m, 0); \
+ stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \
+ \
+ cnst5_m = __msa_splati_h(coeff_m, 1); \
+ cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \
+ stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \
+ stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \
+ stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \
+ \
+ /* stp2 */ \
+ BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \
+ stp33_m); \
+ BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \
+ stp34_m); \
+ \
+ ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \
+ ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 4); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 3); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ /* stp4 */ \
+ BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \
+ vec5_m); \
+ BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
+ stp31_m); \
+ \
+ ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 0); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 2); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 1); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 3); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ }
+
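+/* Rounds each int16 lane by (x + 1 + (x < 0)) >> 2. Scalar equivalent:
+ *   x = (x + 1 + (x < 0 ? 1 : 0)) >> 2; */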
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+ { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one_m = __msa_ldi_h(1); \
+ \
+ tp0_m = __msa_clti_s_h(vec0, 0); \
+ tp1_m = __msa_clti_s_h(vec1, 0); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one_m & tp0_m; \
+ tp1_m = one_m & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+ }
+
+#define FDCT32_POSTPROC_NEG_W(vec) \
+ { \
+ v4i32 temp_m; \
+ v4i32 one_m = __msa_ldi_w(1); \
+ \
+ temp_m = __msa_clti_s_w(vec, 0); \
+ vec += 1; \
+ temp_m = one_m & temp_m; \
+ vec += temp_m; \
+ vec >>= 2; \
+ }
+
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+ { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one = __msa_ldi_h(1); \
+ \
+ tp0_m = __msa_clei_s_h(vec0, 0); \
+ tp1_m = __msa_clei_s_h(vec1, 0); \
+ tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
+ tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one & tp0_m; \
+ tp1_m = one & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+ }
+
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
+ const0, const1, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \
+ v4i32 k0_m = __msa_fill_w((int32_t)const0); \
+ \
+ s0_m = __msa_fill_w((int32_t)const1); \
+ k0_m = __msa_ilvev_w(s0_m, k0_m); \
+ \
+ ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \
+ ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \
+ ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \
+ ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \
+ \
+ DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+ \
+ DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+ }
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif // AOM_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c
new file mode 100644
index 000000000..0ea127f52
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/idct16x16_msa.c
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/inv_txfm_msa.h"
+
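+/* 1-D 16-point inverse DCT over 8 rows at a time: the two 8x8 halves are
+ * transposed in, run through the butterfly stages and transposed back out. */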
+void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ input += 8;
+ LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
+ reg2, reg3, reg4, reg5, reg6, reg7);
+ TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
+ reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+ BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+ BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+ SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
+ reg8);
+ ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
+ reg10);
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+ BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+ BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+ BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+
+ reg13 = loc2;
+
+ /* Transpose and store the output */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ /* transpose block */
+ TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
+ reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+ ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
+
+ /* transpose block */
+ TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
+ reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+ ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
+}
+
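+/* Column pass: 1-D 16-point inverse DCT on 8 columns, then each result is
+ * rounded by >> 6 and added into the destination pixels. */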
+void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ /* load top 8x8 */
+ LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ input += 8 * 16;
+ /* load bottom 8x8 */
+ LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+ BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+ BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+ reg0 = reg2 - loc1;
+ reg2 = reg2 + loc1;
+ reg12 = reg14 - loc0;
+ reg14 = reg14 + loc0;
+ reg4 = reg6 - loc3;
+ reg6 = reg6 + loc3;
+ reg8 = reg10 - loc2;
+ reg10 = reg10 + loc2;
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+ BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+ BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+ BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+ reg13 = loc2;
+
+ /* Restore the saved intermediate results, then round and add to the
+    destination */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+ dst += (4 * dst_stride);
+ SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+ dst += (4 * dst_stride);
+ SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+ dst += (4 * dst_stride);
+ SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
+}
+
+void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* process 16 * 8 block */
+ aom_idct16_1d_rows_msa(input, out);
+
+ /* in this short-transform case only the top 4 rows of the row-pass output
+    are valid; zero the remaining rows */
+ out += 4 * 16;
+ for (i = 12; i--;) {
+ __asm__ __volatile__(
+ "sw $zero, 0(%[out]) \n\t"
+ "sw $zero, 4(%[out]) \n\t"
+ "sw $zero, 8(%[out]) \n\t"
+ "sw $zero, 12(%[out]) \n\t"
+ "sw $zero, 16(%[out]) \n\t"
+ "sw $zero, 20(%[out]) \n\t"
+ "sw $zero, 24(%[out]) \n\t"
+ "sw $zero, 28(%[out]) \n\t"
+
+ :
+ : [out] "r"(out));
+
+ out += 16;
+ }
+
+ out = out_arr;
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
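+/* DC-only reconstruction: the DC coefficient is scaled twice by cospi_16_64,
+ * rounded (>> 6) and the resulting constant is added, with clipping, to every
+ * pixel of the 16x16 destination block. */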
+void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t i;
+ int16_t out;
+ v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
+ v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO(out, 6);
+
+ vec = __msa_fill_h(out);
+
+ for (i = 4; i--;) {
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ UNPCK_UB_SH(dst0, res0, res4);
+ UNPCK_UB_SH(dst1, res1, res5);
+ UNPCK_UB_SH(dst2, res2, res6);
+ UNPCK_UB_SH(dst3, res3, res7);
+ ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+ ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
+ CLIP_SH4_0_255(res0, res1, res2, res3);
+ CLIP_SH4_0_255(res4, res5, res6, res7);
+ PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
+ tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
+ l7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
+ l12, l13, l14, l15);
+
+ /* ADST in horizontal */
+ AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
+ l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+
+ l1 = -r8;
+ l3 = -r4;
+ l13 = -r13;
+ l15 = -r1;
+
+ TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
+ l6, l7);
+ ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+ TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
+ l13, l14, l15);
+ ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
+}
+
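+/* Column pass of the 16-point inverse ADST: butterflies are interleaved with
+ * reconstruction; each pair of output rows is rounded (>> 6), added to the
+ * matching destination rows and clipped to [0, 255]. */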
+void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+ v16i8 zero = { 0 };
+
+ r0 = LD_SH(input + 0 * 16);
+ r3 = LD_SH(input + 3 * 16);
+ r4 = LD_SH(input + 4 * 16);
+ r7 = LD_SH(input + 7 * 16);
+ r8 = LD_SH(input + 8 * 16);
+ r11 = LD_SH(input + 11 * 16);
+ r12 = LD_SH(input + 12 * 16);
+ r15 = LD_SH(input + 15 * 16);
+
+ /* stage 1 */
+ k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+ k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+ k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+ k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+ k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+ k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+ BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+ k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+ k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+ r1 = LD_SH(input + 1 * 16);
+ r2 = LD_SH(input + 2 * 16);
+ r5 = LD_SH(input + 5 * 16);
+ r6 = LD_SH(input + 6 * 16);
+ r9 = LD_SH(input + 9 * 16);
+ r10 = LD_SH(input + 10 * 16);
+ r13 = LD_SH(input + 13 * 16);
+ r14 = LD_SH(input + 14 * 16);
+
+ k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+ k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+ k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+ k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+ k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+ k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+ BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+ BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+ out1 = -out1;
+ SRARI_H2_SH(out0, out1, 6);
+ dst0 = LD_UB(dst + 0 * dst_stride);
+ dst1 = LD_UB(dst + 15 * dst_stride);
+ ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
+ ADD2(res0, out0, res1, out1, res0, res1);
+ CLIP_SH2_0_255(res0, res1);
+ PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
+ ST8x1_UB(res0, dst);
+ ST8x1_UB(res1, dst + 15 * dst_stride);
+
+ k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+ k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+ k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+ MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+
+ SRARI_H2_SH(out8, out9, 6);
+ dst8 = LD_UB(dst + 1 * dst_stride);
+ dst9 = LD_UB(dst + 14 * dst_stride);
+ ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+ ADD2(res8, out8, res9, out9, res8, res9);
+ CLIP_SH2_0_255(res8, res9);
+ PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+ ST8x1_UB(res8, dst + dst_stride);
+ ST8x1_UB(res9, dst + 14 * dst_stride);
+
+ k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+ k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+ MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ SRARI_H2_SH(out4, out5, 6);
+ dst4 = LD_UB(dst + 3 * dst_stride);
+ dst5 = LD_UB(dst + 12 * dst_stride);
+ ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+ ADD2(res4, out4, res5, out5, res4, res5);
+ CLIP_SH2_0_255(res4, res5);
+ PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+ ST8x1_UB(res4, dst + 3 * dst_stride);
+ ST8x1_UB(res5, dst + 12 * dst_stride);
+
+ MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ SRARI_H2_SH(out12, out13, 6);
+ dst12 = LD_UB(dst + 2 * dst_stride);
+ dst13 = LD_UB(dst + 13 * dst_stride);
+ ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+ ADD2(res12, out12, res13, out13, res12, res13);
+ CLIP_SH2_0_255(res12, res13);
+ PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+ ST8x1_UB(res12, dst + 2 * dst_stride);
+ ST8x1_UB(res13, dst + 13 * dst_stride);
+
+ k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+ k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+ MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ SRARI_H2_SH(out6, out7, 6);
+ dst6 = LD_UB(dst + 4 * dst_stride);
+ dst7 = LD_UB(dst + 11 * dst_stride);
+ ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+ ADD2(res6, out6, res7, out7, res6, res7);
+ CLIP_SH2_0_255(res6, res7);
+ PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+ ST8x1_UB(res6, dst + 4 * dst_stride);
+ ST8x1_UB(res7, dst + 11 * dst_stride);
+
+ MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ SRARI_H2_SH(out10, out11, 6);
+ dst10 = LD_UB(dst + 6 * dst_stride);
+ dst11 = LD_UB(dst + 9 * dst_stride);
+ ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+ ADD2(res10, out10, res11, out11, res10, res11);
+ CLIP_SH2_0_255(res10, res11);
+ PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+ ST8x1_UB(res10, dst + 6 * dst_stride);
+ ST8x1_UB(res11, dst + 9 * dst_stride);
+
+ k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+ k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+ MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ SRARI_H2_SH(out2, out3, 6);
+ dst2 = LD_UB(dst + 7 * dst_stride);
+ dst3 = LD_UB(dst + 8 * dst_stride);
+ ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+ ADD2(res2, out2, res3, out3, res2, res3);
+ CLIP_SH2_0_255(res2, res3);
+ PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+ ST8x1_UB(res2, dst + 7 * dst_stride);
+ ST8x1_UB(res3, dst + 8 * dst_stride);
+
+ MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ SRARI_H2_SH(out14, out15, 6);
+ dst14 = LD_UB(dst + 5 * dst_stride);
+ dst15 = LD_UB(dst + 10 * dst_stride);
+ ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+ ADD2(res14, out14, res15, out15, res14, res15);
+ CLIP_SH2_0_255(res14, res15);
+ PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+ ST8x1_UB(res14, dst + 5 * dst_stride);
+ ST8x1_UB(res15, dst + 10 * dst_stride);
+}
diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c
new file mode 100644
index 000000000..f1ca757a0
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/idct32x32_msa.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/inv_txfm_msa.h"
+
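+/* Transposes a 32x8 strip of input coefficients into tmp_buf as four 8x8
+ * tiles. */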
+static void idct32x8_row_transpose_store(const int16_t *input,
+ int16_t *tmp_buf) {
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* 1st & 2nd 8x8 */
+ LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
+ ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
+ ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
+
+ /* 3rd & 4th 8x8 */
+ LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
+ ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
+ ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
+ ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
+}
+
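+/* Even-indexed half of the 1-D 32-point inverse DCT (row pass); the 16
+ * partial results are stored to tmp_eve_buf. */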
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+ BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 15 * 8));
+ ST_SH(loc1, (tmp_eve_buf));
+ ST_SH(loc2, (tmp_eve_buf + 14 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 8));
+
+ BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 13 * 8));
+ ST_SH(loc1, (tmp_eve_buf + 2 * 8));
+ ST_SH(loc2, (tmp_eve_buf + 12 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+ /* Store 8 */
+ BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 11 * 8));
+ ST_SH(loc1, (tmp_eve_buf + 4 * 8));
+ ST_SH(loc2, (tmp_eve_buf + 10 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+ BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 9 * 8));
+ ST_SH(loc1, (tmp_eve_buf + 6 * 8));
+ ST_SH(loc2, (tmp_eve_buf + 8 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
+
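+/* Odd-indexed half of the 1-D 32-point inverse DCT (row pass); the 16
+ * partial results are stored to tmp_odd_buf. */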
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LD_SH(tmp_buf + 8);
+ reg1 = LD_SH(tmp_buf + 7 * 8);
+ reg2 = LD_SH(tmp_buf + 9 * 8);
+ reg3 = LD_SH(tmp_buf + 15 * 8);
+ reg4 = LD_SH(tmp_buf + 17 * 8);
+ reg5 = LD_SH(tmp_buf + 23 * 8);
+ reg6 = LD_SH(tmp_buf + 25 * 8);
+ reg7 = LD_SH(tmp_buf + 31 * 8);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+
+ SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ reg0 = LD_SH(tmp_buf + 3 * 8);
+ reg1 = LD_SH(tmp_buf + 5 * 8);
+ reg2 = LD_SH(tmp_buf + 11 * 8);
+ reg3 = LD_SH(tmp_buf + 13 * 8);
+ reg4 = LD_SH(tmp_buf + 19 * 8);
+ reg5 = LD_SH(tmp_buf + 21 * 8);
+ reg6 = LD_SH(tmp_buf + 27 * 8);
+ reg7 = LD_SH(tmp_buf + 29 * 8);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+ /* 4 Stores */
+ ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
+ BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ ST_SH(reg0, (tmp_odd_buf + 13 * 8));
+ ST_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+
+ /* Load 8 & Store 8 */
+ LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+ LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+ SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Load 8 & Store 8 */
+ LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+ LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+ SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
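+/* Final butterfly combining the even and odd halves, followed by a transpose
+ * of the 32x8 result into the row-pass output buffer. */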
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, int16_t *dst) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY : Dependency on Even & Odd */
+ vec0 = LD_SH(tmp_odd_buf);
+ vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LD_SH(tmp_eve_buf);
+ loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+
+ /* Transpose : 16 vectors */
+ /* 1st & 2nd 8x8 */
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
+ ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
+
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
+ ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
+
+ /* 3rd & 4th 8x8 */
+ LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+ LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
+ ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
+
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
+ ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
+}
+
+static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ idct32x8_row_transpose_store(input, &tmp_buf[0]);
+ idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+ idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+ idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+ output);
+}
+
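+/* Even-indexed half of the 1-D 32-point inverse DCT for the column pass
+ * (loads use the 32-wide coefficient buffer stride). */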
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ tmp_buf += (2 * 32);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ /* Load 8 */
+ LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+ /* Store 8 */
+ BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, tmp_eve_buf, 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
+
+ BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
+
+ /* Store 8 */
+ BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
+
+ BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
+}
+
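+/* Odd-indexed half of the 1-D 32-point inverse DCT for the column pass. */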
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LD_SH(tmp_buf + 32);
+ reg1 = LD_SH(tmp_buf + 7 * 32);
+ reg2 = LD_SH(tmp_buf + 9 * 32);
+ reg3 = LD_SH(tmp_buf + 15 * 32);
+ reg4 = LD_SH(tmp_buf + 17 * 32);
+ reg5 = LD_SH(tmp_buf + 23 * 32);
+ reg6 = LD_SH(tmp_buf + 25 * 32);
+ reg7 = LD_SH(tmp_buf + 31 * 32);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+ SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ ST_SH2(vec0, vec1, tmp_odd_buf, 8);
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ reg0 = LD_SH(tmp_buf + 3 * 32);
+ reg1 = LD_SH(tmp_buf + 5 * 32);
+ reg2 = LD_SH(tmp_buf + 11 * 32);
+ reg3 = LD_SH(tmp_buf + 13 * 32);
+ reg4 = LD_SH(tmp_buf + 19 * 32);
+ reg5 = LD_SH(tmp_buf + 21 * 32);
+ reg6 = LD_SH(tmp_buf + 27 * 32);
+ reg7 = LD_SH(tmp_buf + 29 * 32);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+ BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+ /* 4 Stores */
+ ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
+ BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+ /* Load 8 & Store 8 */
+ LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+ LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+ SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Load 8 & Store 8 */
+ LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+ LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+ SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY : Dependency on Even & Odd */
+ vec0 = LD_SH(tmp_odd_buf);
+ vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LD_SH(tmp_eve_buf);
+ loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+ SRARI_H4_SH(m0, m2, m4, m6, 6);
+ AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
+ SRARI_H4_SH(m0, m2, m4, m6, 6);
+ AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
+ m6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+ SRARI_H4_SH(m1, m3, m5, m7, 6);
+ AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
+ SRARI_H4_SH(m1, m3, m5, m7, 6);
+ AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
+ m7);
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+ SRARI_H4_SH(n0, n2, n4, n6, 6);
+ AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
+ SRARI_H4_SH(n0, n2, n4, n6, 6);
+ AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
+ n6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+ SRARI_H4_SH(n1, n3, n5, n7, 6);
+ AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
+ SRARI_H4_SH(n1, n3, n5, n7, 6);
+ AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
+ n7);
+}
+
+static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+ idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+ idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+ dst_stride);
+}
+
+void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 4; ++i) {
+ /* process 32 * 8 block */
+ idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ for (i = 32; i--;) {
+ __asm__ __volatile__(
+ "sw $zero, 0(%[out_ptr]) \n\t"
+ "sw $zero, 4(%[out_ptr]) \n\t"
+ "sw $zero, 8(%[out_ptr]) \n\t"
+ "sw $zero, 12(%[out_ptr]) \n\t"
+ "sw $zero, 16(%[out_ptr]) \n\t"
+ "sw $zero, 20(%[out_ptr]) \n\t"
+ "sw $zero, 24(%[out_ptr]) \n\t"
+ "sw $zero, 28(%[out_ptr]) \n\t"
+ "sw $zero, 32(%[out_ptr]) \n\t"
+ "sw $zero, 36(%[out_ptr]) \n\t"
+ "sw $zero, 40(%[out_ptr]) \n\t"
+ "sw $zero, 44(%[out_ptr]) \n\t"
+ "sw $zero, 48(%[out_ptr]) \n\t"
+ "sw $zero, 52(%[out_ptr]) \n\t"
+ "sw $zero, 56(%[out_ptr]) \n\t"
+ "sw $zero, 60(%[out_ptr]) \n\t"
+
+ :
+ : [out_ptr] "r"(out_ptr));
+
+ out_ptr += 32;
+ }
+
+ out_ptr = out_arr;
+
+ /* rows: only upper-left 8x8 has non-zero coeff */
+ idct32x8_1d_rows_msa(input, out_ptr);
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ int16_t out;
+ v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO(out, 6);
+
+ vec = __msa_fill_h(out);
+
+ for (i = 16; i--;) {
+ LD_UB2(dst, 16, dst0, dst1);
+ LD_UB2(dst + dst_stride, 16, dst2, dst3);
+
+ UNPCK_UB_SH(dst0, res0, res4);
+ UNPCK_UB_SH(dst1, res1, res5);
+ UNPCK_UB_SH(dst2, res2, res6);
+ UNPCK_UB_SH(dst3, res3, res7);
+ ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+ ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
+ CLIP_SH4_0_255(res0, res1, res2, res3);
+ CLIP_SH4_0_255(res4, res5, res6, res7);
+ PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
+ tmp2, tmp3);
+
+ ST_UB2(tmp0, tmp1, dst, 16);
+ dst += dst_stride;
+ ST_UB2(tmp2, tmp3, dst, 16);
+ dst += dst_stride;
+ }
+}
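For reference, the DC-only 32x32 path just above reduces to plain scalar arithmetic: the single DC coefficient is scaled twice by cospi_16_64 with DCT_CONST_BITS rounding, rounded again by 6, and then added (with clipping) to every output pixel. A minimal scalar sketch follows; it is an illustration only, and the constant values used (cospi_16_64 = 11585, DCT_CONST_BITS = 14, the usual libaom definitions) are assumptions here, as are the helper names.

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14 /* assumed value */
#define SKETCH_COSPI_16_64 11585 /* assumed value */
#define SKETCH_ROUND_POW2(v, n) (((v) + (1 << ((n)-1))) >> (n))

static uint8_t clip_pixel_ref(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Reference sketch of the DC-only 32x32 inverse transform + add. */
static void idct32x32_dc_add_ref(int16_t dc, uint8_t *dst, int stride) {
  int out = SKETCH_ROUND_POW2(dc * SKETCH_COSPI_16_64, SKETCH_DCT_CONST_BITS);
  out = SKETCH_ROUND_POW2(out * SKETCH_COSPI_16_64, SKETCH_DCT_CONST_BITS);
  out = SKETCH_ROUND_POW2(out, 6); /* final 32x32 rounding, as in the MSA code */
  for (int r = 0; r < 32; ++r)
    for (int c = 0; c < 32; ++c)
      dst[r * stride + c] = clip_pixel_ref(dst[r * stride + c] + out);
}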
diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c
new file mode 100644
index 000000000..274818baa
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/idct4x4_msa.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/inv_txfm_msa.h"
+
+void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3;
+ v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;
+
+ /* load vector elements of 4x4 block */
+ LD4x4_SH(input, in0, in2, in3, in1);
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+ UNPCK_R_SH_SW(in0, in0_r);
+ UNPCK_R_SH_SW(in2, in2_r);
+ UNPCK_R_SH_SW(in3, in3_r);
+ UNPCK_R_SH_SW(in1, in1_r);
+ SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);
+
+ in0_r += in2_r;
+ in3_r -= in1_r;
+ in4_r = (in0_r - in3_r) >> 1;
+ in1_r = in4_r - in1_r;
+ in2_r = in4_r - in2_r;
+ in0_r -= in1_r;
+ in3_r += in2_r;
+
+ TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);
+
+ in0_r += in1_r;
+ in2_r -= in3_r;
+ in4_r = (in0_r - in2_r) >> 1;
+ in3_r = in4_r - in3_r;
+ in1_r = in4_r - in1_r;
+ in0_r -= in3_r;
+ in2_r += in1_r;
+
+ PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
+ in2, in3);
+ ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
+}
+
+void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int16_t a1, e1;
+ v8i16 in1, in0 = { 0 };
+
+ a1 = input[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+
+ in0 = __msa_insert_h(in0, 0, a1);
+ in0 = __msa_insert_h(in0, 1, e1);
+ in0 = __msa_insert_h(in0, 2, e1);
+ in0 = __msa_insert_h(in0, 3, e1);
+
+ in1 = in0 >> 1;
+ in0 -= in1;
+
+ ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride);
+}
+
+void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3;
+
+ /* load vector elements of 4x4 block */
+ LD4x4_SH(input, in0, in1, in2, in3);
+ /* rows */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* columns */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* rounding (add 2^3, divide by 2^4) */
+ SRARI_H4_SH(in0, in1, in2, in3, 4);
+ ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int16_t out;
+ v8i16 vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO(out, 4);
+ vec = __msa_fill_h(out);
+
+ ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
+}
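The aom_iwht4x4_16_add_msa code above applies the standard inverse Walsh-Hadamard butterfly to whole vectors, once per direction around a transpose, after pre-shifting the inputs by UNIT_QUANT_SHIFT. A scalar sketch of one 1-D pass, for reference only (not part of the imported file):

/* One 1-D inverse WHT pass over four coefficients (reference sketch). */
static void iwht4_1d_ref(const int in[4], int out[4]) {
  int a = in[0], c = in[1], d = in[2], b = in[3];
  int e;
  a += c;
  d -= b;
  e = (a - d) >> 1;
  b = e - b;
  c = e - c;
  a -= b;
  d += c;
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
}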
diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c
new file mode 100644
index 000000000..981c103cd
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/idct8x8_msa.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/inv_txfm_msa.h"
+
+void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ /* load vector elements of 8x8 block */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+ /* rows transform */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* 1D idct8x8 */
+ AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* columns transform */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* 1D idct8x8 */
+ AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* final rounding (add 2^4, divide by 2^5) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 5);
+ SRARI_H4_SH(in4, in5, in6, in7, 5);
+ /* add block and store 8x8 */
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+ dst += (4 * dst_stride);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
+ v4i32 tmp0, tmp1, tmp2, tmp3;
+ v8i16 zero = { 0 };
+
+ /* load vector elements of 8x8 block */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ /* stage1 */
+ ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
+ k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+ k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+ k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+ DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+ SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+ PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+ PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+ BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
+
+ /* stage2 */
+ ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
+ k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+ k1 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+ k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+ k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+ DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+ SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+ PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+ PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+ BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
+
+ /* stage3 */
+ s0 = __msa_ilvr_h(s6, s5);
+
+ k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+ DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
+ SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
+ PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
+
+ /* stage4 */
+ BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
+ in7);
+ TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ /* final rounding (add 2^4, divide by 2^5) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 5);
+ SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+ /* add block and store 8x8 */
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+ dst += (4 * dst_stride);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int16_t out;
+ int32_t val;
+ v8i16 vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ val = ROUND_POWER_OF_TWO(out, 5);
+ vec = __msa_fill_h(val);
+
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+ dst += (4 * dst_stride);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+}
diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
new file mode 100644
index 000000000..dc8f20208
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+ "lb %[tmp9], 8(%[left]) \n\t"
+ "lb %[tmp10], 9(%[left]) \n\t"
+ "lb %[tmp11], 10(%[left]) \n\t"
+ "lb %[tmp12], 11(%[left]) \n\t"
+ "lb %[tmp13], 12(%[left]) \n\t"
+ "lb %[tmp14], 13(%[left]) \n\t"
+ "lb %[tmp15], 14(%[left]) \n\t"
+ "lb %[tmp16], 15(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+ "replv.qb %[tmp9], %[tmp9] \n\t"
+ "replv.qb %[tmp10], %[tmp10] \n\t"
+ "replv.qb %[tmp11], %[tmp11] \n\t"
+ "replv.qb %[tmp12], %[tmp12] \n\t"
+ "replv.qb %[tmp13], %[tmp13] \n\t"
+ "replv.qb %[tmp14], %[tmp14] \n\t"
+ "replv.qb %[tmp15], %[tmp15] \n\t"
+ "replv.qb %[tmp16], %[tmp16] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "sw %[tmp1], 8(%[dst]) \n\t"
+ "sw %[tmp1], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "sw %[tmp2], 8(%[dst]) \n\t"
+ "sw %[tmp2], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "sw %[tmp3], 8(%[dst]) \n\t"
+ "sw %[tmp3], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "sw %[tmp4], 8(%[dst]) \n\t"
+ "sw %[tmp4], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "sw %[tmp5], 8(%[dst]) \n\t"
+ "sw %[tmp5], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "sw %[tmp6], 8(%[dst]) \n\t"
+ "sw %[tmp6], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "sw %[tmp7], 8(%[dst]) \n\t"
+ "sw %[tmp7], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+ "sw %[tmp8], 8(%[dst]) \n\t"
+ "sw %[tmp8], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp9], (%[dst]) \n\t"
+ "sw %[tmp9], 4(%[dst]) \n\t"
+ "sw %[tmp9], 8(%[dst]) \n\t"
+ "sw %[tmp9], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp10], (%[dst]) \n\t"
+ "sw %[tmp10], 4(%[dst]) \n\t"
+ "sw %[tmp10], 8(%[dst]) \n\t"
+ "sw %[tmp10], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp11], (%[dst]) \n\t"
+ "sw %[tmp11], 4(%[dst]) \n\t"
+ "sw %[tmp11], 8(%[dst]) \n\t"
+ "sw %[tmp11], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp12], (%[dst]) \n\t"
+ "sw %[tmp12], 4(%[dst]) \n\t"
+ "sw %[tmp12], 8(%[dst]) \n\t"
+ "sw %[tmp12], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp13], (%[dst]) \n\t"
+ "sw %[tmp13], 4(%[dst]) \n\t"
+ "sw %[tmp13], 8(%[dst]) \n\t"
+ "sw %[tmp13], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp14], (%[dst]) \n\t"
+ "sw %[tmp14], 4(%[dst]) \n\t"
+ "sw %[tmp14], 8(%[dst]) \n\t"
+ "sw %[tmp14], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp15], (%[dst]) \n\t"
+ "sw %[tmp15], 4(%[dst]) \n\t"
+ "sw %[tmp15], 8(%[dst]) \n\t"
+ "sw %[tmp15], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp16], (%[dst]) \n\t"
+ "sw %[tmp16], 4(%[dst]) \n\t"
+ "sw %[tmp16], 8(%[dst]) \n\t"
+ "sw %[tmp16], 12(%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
+ [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
+ [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
+ [tmp16] "=&r"(tmp16)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, left2;
+
+ __asm__ __volatile__(
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "lw %[above1], 8(%[above]) \n\t"
+ "lw %[above2], 12(%[above]) \n\t"
+ "lw %[left1], 8(%[left]) \n\t"
+ "lw %[left2], 12(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addiu %[average], %[average], 16 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 5 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
+ [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
+ [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
+ [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+#endif // #if HAVE_DSPR2
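The DSPR2 DC predictor above boils down to a rounded average of the 16 above and 16 left neighbours, replicated across the 16x16 block; a scalar equivalent for reference (the helper name is illustrative, not from the source):

#include <stddef.h>
#include <stdint.h>

static void dc_predictor_16x16_ref(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  int i, r, c, sum = 0;
  for (i = 0; i < 16; ++i) sum += above[i] + left[i];
  /* "addiu ... 16" then "srl ... 5" in the asm: rounded 32-sample average */
  const uint8_t dc = (uint8_t)((sum + 16) >> 5);
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dst[c] = dc;
    dst += stride;
  }
}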
diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
new file mode 100644
index 000000000..ea7c02810
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "sw %[tmp1], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+ __asm__ __volatile__(
+ "lw %[above_c], (%[above]) \n\t"
+ "lw %[left_c], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l], %[above_c] \n\t"
+ "preceu.ph.qbr %[above_r], %[above_c] \n\t"
+ "preceu.ph.qbl %[left_l], %[left_c] \n\t"
+ "preceu.ph.qbr %[left_r], %[left_c] \n\t"
+
+ "addu.ph %[average], %[above_r], %[above_l] \n\t"
+ "addu.ph %[average], %[average], %[left_l] \n\t"
+ "addu.ph %[average], %[average], %[left_r] \n\t"
+ "addiu %[average], %[average], 4 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 3 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+
+ : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
+ [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
+ [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+
+void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t left0, left1, left2, left3;
+ int32_t res0, res1;
+ int32_t resl;
+ int32_t resr;
+ int32_t top_left;
+ uint8_t *cm = aom_ff_cropTbl;
+
+ __asm__ __volatile__(
+ "ulw %[resl], (%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+ "lbu %[left1], 1(%[left]) \n\t"
+ "lbu %[left2], 2(%[left]) \n\t"
+ "lbu %[left3], 3(%[left]) \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[resl] \n\t"
+ "preceu.ph.qbr %[abover], %[resl] \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "replv.ph %[left1], %[left1] \n\t"
+ "replv.ph %[left2], %[left2] \n\t"
+ "replv.ph %[left3], %[left3] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left0] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left0] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left1] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left1] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left2] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left2] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left3] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left3] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
+ [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
+ [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
+ [resr] "=&r"(resr), [top_left] "=&r"(top_left)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride), [cm] "r"(cm));
+}
+#endif // #if HAVE_DSPR2
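The TM (true-motion) predictor above computes left[row] + above[col] - top_left for every pixel, with the clamp to [0, 255] done through the aom_ff_cropTbl lookup; a scalar sketch for reference (illustration only):

#include <stddef.h>
#include <stdint.h>

static void tm_predictor_4x4_ref(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1]; /* loaded via "-1(%[above])" in the asm */
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* cropTbl clamp */
    }
    dst += stride;
  }
}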
diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
new file mode 100644
index 000000000..1114fbc00
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+ __asm__ __volatile__(
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "preceu.ph.qbl %[above_l2], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r2], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l2], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r2], %[left2] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l2] \n\t"
+ "addu.ph %[average], %[average], %[above_r2] \n\t"
+ "addu.ph %[average], %[average], %[left_l2] \n\t"
+ "addu.ph %[average], %[average], %[left_r2] \n\t"
+
+ "addiu %[average], %[average], 8 \n\t"
+
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 4 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
+ [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
+ [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
+ [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
+ [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
+ [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+
+void aom_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t abovel_1, abover_1;
+ int32_t left0;
+ int32_t res0, res1, res2, res3;
+ int32_t reshw;
+ int32_t top_left;
+ uint8_t *cm = aom_ff_cropTbl;
+
+ __asm__ __volatile__(
+ "ulw %[reshw], (%[above]) \n\t"
+ "ulw %[top_left], 4(%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[reshw] \n\t"
+ "preceu.ph.qbr %[abover], %[reshw] \n\t"
+ "preceu.ph.qbl %[abovel_1], %[top_left] \n\t"
+ "preceu.ph.qbr %[abover_1], %[top_left] \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+ "replv.ph %[left0], %[left0] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 1(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 2(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 3(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 4(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 5(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 6(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 7(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ : [abovel] "=&r"(abovel), [abover] "=&r"(abover),
+ [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1),
+ [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3),
+ [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw),
+ [top_left] "=&r"(top_left)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride), [cm] "r"(cm));
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
new file mode 100644
index 000000000..e8eaec7a9
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c
@@ -0,0 +1,739 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
+ { \
+ out0 = __msa_subs_u_h(out0, in0); \
+ out1 = __msa_subs_u_h(out1, in1); \
+ }
+
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t src_data;
+
+ src_data = LW(src);
+
+ SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
+}
+
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint32_t src_data1, src_data2;
+
+ src_data1 = LW(src);
+ src_data2 = LW(src + 4);
+
+ for (row = 8; row--;) {
+ SW(src_data1, dst);
+ SW(src_data2, (dst + 4));
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 src0;
+
+ src0 = LD_UB(src);
+
+ for (row = 16; row--;) {
+ ST_UB(src0, dst);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 src1, src2;
+
+ src1 = LD_UB(src);
+ src2 = LD_UB(src + 16);
+
+ for (row = 32; row--;) {
+ ST_UB2(src1, src2, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t out0, out1, out2, out3;
+
+ out0 = src[0] * 0x01010101;
+ out1 = src[1] * 0x01010101;
+ out2 = src[2] * 0x01010101;
+ out3 = src[3] * 0x01010101;
+
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ out0 = src[0] * 0x0101010101010101ull;
+ out1 = src[1] * 0x0101010101010101ull;
+ out2 = src[2] * 0x0101010101010101ull;
+ out3 = src[3] * 0x0101010101010101ull;
+ out4 = src[4] * 0x0101010101010101ull;
+ out5 = src[5] * 0x0101010101010101ull;
+ out6 = src[6] * 0x0101010101010101ull;
+ out7 = src[7] * 0x0101010101010101ull;
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+}
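The 4x4 and 8x8 horizontal predictors above broadcast one left-column byte across a row by multiplying it with a 0x01...01 constant; a small plain-C illustration, for reference only:

#include <stdint.h>

static inline uint32_t bcast4(uint8_t b) { return b * 0x01010101u; }
static inline uint64_t bcast8(uint8_t b) { return b * 0x0101010101010101ull; }
/* e.g. bcast4(0xab) == 0xababababu; SW4()/SD4() then store one such word
   (or doubleword) per output row. */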
+
+static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint8_t inp0, inp1, inp2, inp3;
+ v16u8 src0, src1, src2, src3;
+
+ for (row = 4; row--;) {
+ inp0 = src[0];
+ inp1 = src[1];
+ inp2 = src[2];
+ inp3 = src[3];
+ src += 4;
+
+ src0 = (v16u8)__msa_fill_b(inp0);
+ src1 = (v16u8)__msa_fill_b(inp1);
+ src2 = (v16u8)__msa_fill_b(inp2);
+ src3 = (v16u8)__msa_fill_b(inp3);
+
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint8_t inp0, inp1, inp2, inp3;
+ v16u8 src0, src1, src2, src3;
+
+ for (row = 8; row--;) {
+ inp0 = src[0];
+ inp1 = src[1];
+ inp2 = src[2];
+ inp3 = src[3];
+ src += 4;
+
+ src0 = (v16u8)__msa_fill_b(inp0);
+ src1 = (v16u8)__msa_fill_b(inp1);
+ src2 = (v16u8)__msa_fill_b(inp2);
+ src3 = (v16u8)__msa_fill_b(inp3);
+
+ ST_UB2(src0, src0, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src1, src1, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src2, src2, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src3, src3, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val0, val1;
+ v16i8 store, src = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LW(src_top);
+ val1 = LW(src_left);
+ INSERT_W2_SB(val0, val1, src);
+ sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val0;
+ v16i8 store, data = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+
+ val0 = LW(src);
+ data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
+ sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
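+/* DC prediction with neither edge available: the block is filled with the
+ * mid-grey value 128. */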
+static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
+ uint32_t out;
+ const v16i8 store = __msa_ldi_b(128);
+
+ out = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val0, val1;
+ v16i8 store;
+ v16u8 src = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LD(src_top);
+ val1 = LD(src_left);
+ INSERT_D2_UB(val0, val1, src);
+ sum_h = __msa_hadd_u_h(src, src);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val0;
+ v16i8 store;
+ v16u8 data = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LD(src);
+ data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
+ sum_h = __msa_hadd_u_h(data, data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
+ uint64_t out;
+ const v16i8 store = __msa_ldi_b(128);
+
+ out = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(out, out, out, out, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ v16u8 top, left, out;
+ v8u16 sum_h, sum_top, sum_left;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ top = LD_UB(src_top);
+ left = LD_UB(src_left);
+ HADD_UB2_UH(top, left, sum_top, sum_left);
+ sum_h = sum_top + sum_left;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ v16u8 data, out;
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ data = LD_UB(src);
+ sum_h = __msa_hadd_u_h(data, data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
+ const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 top0, top1, left0, left1, out;
+ v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ LD_UB2(src_top, 16, top0, top1);
+ LD_UB2(src_left, 16, left0, left1);
+ HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+ HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+ sum_h = sum_top0 + sum_top1;
+ sum_h += sum_left0 + sum_left1;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 data0, data1, out;
+ v8u16 sum_h, sum_data0, sum_data1;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ LD_UB2(src, 16, data0, data1);
+ HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
+ sum_h = sum_data0 + sum_data1;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
+ uint32_t row;
+ const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
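+/* TM (TrueMotion) prediction: pred(r, c) = clip(left[r] + above[c] -
+ * top_left). The interleave/horizontal-add pair builds left[r] + above[c] in
+ * 16-bit lanes, IPRED_SUBS_UH2_UH subtracts top_left with unsigned
+ * saturation, and SAT_UH/PCKEV clamp the result to the 0..255 pixel range. */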
+static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val;
+ uint8_t top_left = src_top_ptr[-1];
+ v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+ v16u8 src0, src1, src2, src3;
+ v8u16 src_top_left, vec0, vec1, vec2, vec3;
+
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+ val = LW(src_top_ptr);
+ src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);
+
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+
+ ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+ src_left3, src_top, src0, src1, src2, src3);
+ HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+ SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+ ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val;
+ uint8_t top_left = src_top_ptr[-1];
+ uint32_t loop_cnt;
+ v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+ v8u16 src_top_left, vec0, vec1, vec2, vec3;
+ v16u8 src0, src1, src2, src3;
+
+ val = LD(src_top_ptr);
+ src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+ src_left += 4;
+
+ ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+ src_left3, src_top, src0, src1, src2, src3);
+ HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+ SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t top_left = src_top_ptr[-1];
+ uint32_t loop_cnt;
+ v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+ v8u16 src_top_left, res_r, res_l;
+
+ src_top = LD_SB(src_top_ptr);
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+ src_left += 4;
+
+ ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t top_left = src_top[-1];
+ uint32_t loop_cnt;
+ v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+ v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+
+ LD_SB2(src_top, 16, src_top0, src_top1);
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+ src_left += 4;
+
+ ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_4x4_msa(above, dst, y_stride);
+}
+
+void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_8x8_msa(above, dst, y_stride);
+}
+
+void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_16x16_msa(above, dst, y_stride);
+}
+
+void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_32x32_msa(above, dst, y_stride);
+}
+
+void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_4x4_msa(left, dst, y_stride);
+}
+
+void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_8x8_msa(left, dst, y_stride);
+}
+
+void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_16x16_msa(left, dst, y_stride);
+}
+
+void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_32x32_msa(left, dst, y_stride);
+}
+
+void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_4x4_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_8x8_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_16x16_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_32x32_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
+}
+
+void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
+}
+
+void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
+}
+
+void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
+}
+
+void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
+}
+
+void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
+}
+
+void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
+}
+
+void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
+}
+
+void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_4x4_msa(dst, y_stride);
+}
+
+void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_8x8_msa(dst, y_stride);
+}
+
+void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_16x16_msa(dst, y_stride);
+}
+
+void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_32x32_msa(dst, y_stride);
+}
+
+void aom_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_4x4_msa(above, left, dst, y_stride);
+}
+
+void aom_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_8x8_msa(above, left, dst, y_stride);
+}
+
+void aom_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_16x16_msa(above, left, dst, y_stride);
+}
+
+void aom_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_32x32_msa(above, left, dst, y_stride);
+}
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
new file mode 100644
index 000000000..8a85e26f3
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
+#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/inv_txfm.h"
+#include "aom_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
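+/* Computes dct_const_round_shift(dct_const_round_shift(input * cospi_16_64) *
+ * cospi_16_64) using the MIPS DSP ASE multiply-accumulate registers. */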
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                      \
+  ({                                                                        \
+    int32_t tmp, out;                                                       \
+    int dct_const_rounding = DCT_CONST_ROUNDING;                            \
+    int in = input;                                                         \
+                                                                            \
+    __asm__ __volatile__(                                                   \
+        /* out = dct_const_round_shift(dc * cospi_16_64); */                \
+        "mtlo   %[dct_const_rounding],   $ac1             \n\t"             \
+        "mthi   $zero,                   $ac1             \n\t"             \
+        "madd   $ac1, %[in],             %[cospi_16_64]   \n\t"             \
+        "extp   %[tmp], $ac1,            31               \n\t"             \
+                                                                            \
+        /* out = dct_const_round_shift(out * cospi_16_64); */               \
+        "mtlo   %[dct_const_rounding],   $ac2             \n\t"             \
+        "mthi   $zero,                   $ac2             \n\t"             \
+        "madd   $ac2, %[tmp],            %[cospi_16_64]   \n\t"             \
+        "extp   %[out], $ac2,            31               \n\t"             \
+                                                                            \
+        : [tmp] "=&r"(tmp), [out] "=r"(out)                                 \
+        : [in] "r"(in),                                                     \
+          [dct_const_rounding] "r"(dct_const_rounding),                     \
+          [cospi_16_64] "r"(cospi_16_64));                                  \
+    out;                                                                    \
+  })
+
+void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride);
+void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output);
+void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride);
+void iadst4_dspr2(const int16_t *input, int16_t *output);
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride);
+void iadst8_dspr2(const int16_t *input, int16_t *output);
+void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
+void iadst16_dspr2(const int16_t *input, int16_t *output);
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
new file mode 100644
index 000000000..122667aa8
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_
+#define AOM_DSP_MIPS_INV_TXFM_MSA_H_
+
+#include "aom_dsp/mips/macros_msa.h"
+#include "aom_dsp/mips/txfm_macros_msa.h"
+#include "aom_dsp/txfm_common.h"
+
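+/* 1-D 8-point inverse ADST over eight v8i16 vectors. coeff0_m/coeff1_m hold
+ * the cospi constants, DOT_ADD_SUB_SRARI_PCK performs the paired
+ * multiply/add/round stages, and the odd-indexed outputs are negated at the
+ * end. */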
+#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
+ v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
+ cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
+ v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
+ cospi_24_64, -cospi_24_64, 0, 0 }; \
+ \
+ SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in7, in0, in4, in3); \
+ \
+ SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in5, in2, in6, in1); \
+ BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
+ out7 = -s0_m; \
+ out0 = s1_m; \
+ \
+ SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
+ \
+ ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ cnst1_m = cnst0_m; \
+ \
+ ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
+ cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
+ \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ out1 = -out1; \
+ out3 = -out3; \
+ out5 = -out5; \
+ }
+
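+/* Builds a v8i16 of the form { c0, c1, c0, c1, ... } so that one dot product
+ * against an interleaved input pair yields c0 * a + c1 * b per 32-bit lane. */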
+#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \
+ ({ \
+ v8i16 out0_m, r0_m, r1_m; \
+ \
+ r0_m = __msa_fill_h(c0_h); \
+ r1_m = __msa_fill_h(c1_h); \
+ out0_m = __msa_ilvev_h(r1_m, r0_m); \
+ \
+ out0_m; \
+ })
+
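+/* Adds four rows of 8 residual values to the destination block: load dst,
+ * zero-extend to 16 bits, add, clip to [0, 255], pack back to bytes, store. */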
+#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \
+ { \
+ uint8_t *dst_m = (uint8_t *)(dst); \
+ v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ v16i8 zero_m = { 0 }; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
+ ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
+ res0_m, res1_m, res2_m, res3_m); \
+ ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \
+ res2_m, res3_m); \
+ CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
+ PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
+ }
+
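+/* 1-D 4-point IDCT: the even inputs are combined with (cospi_16_64,
+ * +/-cospi_16_64) pairs and the odd inputs with (cospi_24_64, cospi_8_64)
+ * pairs, each dot product is rounded by DCT_CONST_BITS, and a final
+ * butterfly produces the four outputs. */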
+#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 c0_m, c1_m, c2_m, c3_m; \
+ v8i16 step0_m, step1_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ step0_m = __msa_ilvr_h(in2, in0); \
+ DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ \
+ c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ step1_m = __msa_ilvr_h(in3, in1); \
+ DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ \
+ PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
+ SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
+ BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
+ out0, out1, out2, out3); \
+ }
+
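+/* 1-D 4-point inverse ADST built from the sinpi_*_9 constants in mask_m;
+ * intermediate sums are kept in 32-bit lanes and rounded by DCT_CONST_BITS
+ * before being packed back to 16 bits. */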
+#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 res0_m, res1_m, c0_m, c1_m; \
+ v8i16 k1_m, k2_m, k3_m, k4_m; \
+ v8i16 zero_m = { 0 }; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v4i32 int0_m, int1_m, int2_m, int3_m; \
+ v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \
+ -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
+ \
+ SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
+ ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
+ ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
+ int0_m = tmp2_m + tmp1_m; \
+ \
+ SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
+ ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ int1_m = tmp0_m + tmp1_m; \
+ \
+ c0_m = __msa_splati_h(mask_m, 6); \
+ ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
+ ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ int2_m = tmp0_m + tmp1_m; \
+ \
+ c0_m = __msa_splati_h(mask_m, 6); \
+ c0_m = __msa_ilvev_h(c0_m, k1_m); \
+ \
+ res0_m = __msa_ilvr_h((in1), (in3)); \
+ tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
+ int3_m = tmp2_m + tmp0_m; \
+ \
+ res0_m = __msa_ilvr_h((in2), (in3)); \
+ c1_m = __msa_ilvev_h(k4_m, k3_m); \
+ \
+ tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
+ res1_m = __msa_ilvr_h((in0), (in2)); \
+ c1_m = __msa_ilvev_h(k1_m, zero_m); \
+ \
+ tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
+ int3_m += tmp2_m; \
+ int3_m += tmp3_m; \
+ \
+ SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
+ PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
+ }
+
+#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \
+ ({ \
+ v8i16 c0_m, c1_m; \
+ \
+ SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
+ c0_m = __msa_ilvev_h(c1_m, c0_m); \
+ \
+ c0_m; \
+ })
+
+/* Multiply-and-add: forms dot products of the interleaved inputs with the
+ * constant pairs cst0..cst3, rounds by DCT_CONST_BITS and packs the results
+ * back to 16-bit lanes. */
+#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
+ out2, out3) \
+ { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
+ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
+ cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
+ SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \
+ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
+ cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
+ SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \
+ }
+
+/* 1-D 8-point IDCT: transforms eight v8i16 vectors in0..in7 into
+ * out0..out7. */
+#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
+ cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
+ \
+ k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \
+ k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \
+ k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \
+ k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \
+ AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
+ SUB2(in1, in3, in7, in5, res0_m, res1_m); \
+ k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \
+ k1_m = __msa_splati_h(mask_m, 4); \
+ \
+ ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
+ DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ tp4_m = in1 + in3; \
+ PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
+ tp7_m = in7 + in5; \
+ k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
+ BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
+ BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
+ out1, out2, out3, out4, out5, out6, out7); \
+ }
+
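+/* 1-D 8-point inverse ADST on eight v8i16 rows: each stage takes 32-bit dot
+ * products against cospi pairs, rounds by DCT_CONST_BITS and packs back to
+ * 16 bits; the odd-indexed outputs are negated at the end. */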
+#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
+ v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
+ v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \
+ cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
+ v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \
+ -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
+ v8i16 mask3_m = { \
+ -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \
+ }; \
+ \
+ k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \
+ k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \
+ ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \
+ k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \
+ ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
+ r5_m, r6_m, r7_m); \
+ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
+ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
+ k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \
+ k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \
+ ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \
+ k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \
+ ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
+ r5_m, r6_m, r7_m); \
+ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
+ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
+ ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
+ BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
+ k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \
+ k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \
+ ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \
+ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \
+ r6_m, r7_m); \
+ ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
+ SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
+ k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \
+ k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \
+ ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \
+ m1_m, m2_m, m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
+ ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
+ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \
+ m2_m, m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
+ \
+ out1 = -in1; \
+ out3 = -in3; \
+ out5 = -in5; \
+ out7 = -in7; \
+ }
+
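+/* 1-D 16-point inverse ADST over sixteen v8i16 vectors, expressed as four
+ * butterfly stages of cospi dot products (MADD_BF) plus the final
+ * cospi_16_64 stage (MADD_SHORT). */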
+#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \
+ r12, r13, r14, r15, out0, out1, out2, out3, out4, \
+ out5, out6, out7, out8, out9, out10, out11, out12, \
+ out13, out14, out15) \
+ { \
+ v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
+ v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
+ v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
+ v8i16 h8_m, h9_m, h10_m, h11_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m; \
+ \
+ /* stage 1 */ \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
+ k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
+ k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
+ k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
+ MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
+ k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
+ k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
+ k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
+ MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
+ k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
+ k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
+ k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
+ MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \
+ g11_m); \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
+ k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
+ k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
+ k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
+ MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \
+ g15_m); \
+ \
+ /* stage 2 */ \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
+ k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
+ k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
+ MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
+ h3_m); \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
+ k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
+ k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
+ MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \
+ h6_m, h7_m); \
+ BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
+ BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
+ h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
+ \
+ /* stage 3 */ \
+ BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
+ MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \
+ out7); \
+ MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \
+ out13, out15); \
+ \
+ /* stage 4 */ \
+ k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
+ k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
+ MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
+ MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
+ MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
+ MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
+ }
+
+void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride);
+void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
+void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride);
+void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
+#endif // AOM_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
new file mode 100644
index 000000000..c63b1e857
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
@@ -0,0 +1,1190 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
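+/* Row pass of the 16-point IDCT: transforms no_rows rows of 16 coefficients.
+ * Each result is stored 16 elements apart while output advances by one
+ * element per row, so the intermediate buffer comes out transposed, ready
+ * for the column pass. */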
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_10, step1_11, step1_12, step1_13;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+
+ for (i = no_rows; i--;) {
+ /* prefetch row */
+ prefetch_load((const uint8_t *)(input + 16));
+
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
+ [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
+ [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
+ [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
+ [step2_14] "=r"(step2_14)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
+ [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
+ [step1_13] "=r"(step1_13)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
+ [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
+ [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
+ [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step2_12] \n\t"
+ "add %[load5], %[load5], %[step2_15] \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step2_13] \n\t"
+ "add %[load6], %[load6], %[step2_14] \n\t"
+ "sh %[load5], 0(%[output]) \n\t"
+ "sh %[load6], 32(%[output]) \n\t"
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "add %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+ "add %[load6], %[load6], %[step2_11] \n\t"
+ "sh %[load5], 192(%[output]) \n\t"
+ "sh %[load6], 224(%[output]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "sub %[load5], %[load5], %[step2_11] \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step2_9] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "sh %[load5], 256(%[output]) \n\t"
+ "sh %[load6], 288(%[output]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_14] \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_15] \n\t"
+ "sh %[load5], 448(%[output]) \n\t"
+ "sh %[load6], 480(%[output]) \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6)
+ : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
+ [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
+ [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
+ [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
+ [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
+
+ __asm__ __volatile__(
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sh %[load5], 64(%[output]) \n\t"
+ "sh %[load6], 96(%[output]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sh %[load5], 128(%[output]) \n\t"
+ "sh %[load6], 160(%[output]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sh %[load5], 320(%[output]) \n\t"
+ "sh %[load6], 352(%[output]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sh %[load5], 384(%[output]) \n\t"
+ "sh %[load6], 416(%[output]) \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6)
+ : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
+ [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
+ [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
+
+ input += 16;
+ output += 1;
+ }
+}
+
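+/* Column pass of the 16-point IDCT: for each of the 16 columns the butterfly
+ * stages are recomputed, then the result is added to the destination pixels
+ * and clamped to [0, 255] via the aom_ff_cropTbl lookup table. */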
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_8, step1_9, step1_10, step1_11;
+ int step1_12, step1_13, step1_14, step1_15;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = aom_ff_cropTbl;
+
+ /* prefetch aom_ff_cropTbl */
+ prefetch_load(aom_ff_cropTbl);
+ prefetch_load(aom_ff_cropTbl + 32);
+ prefetch_load(aom_ff_cropTbl + 64);
+ prefetch_load(aom_ff_cropTbl + 96);
+ prefetch_load(aom_ff_cropTbl + 128);
+ prefetch_load(aom_ff_cropTbl + 160);
+ prefetch_load(aom_ff_cropTbl + 192);
+ prefetch_load(aom_ff_cropTbl + 224);
+
+ for (i = 0; i < 16; ++i) {
+ dest_pix = (dest + i);
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
+ [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
+ [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
+ [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
+ [step2_14] "=r"(step2_14)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
+ [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
+ [step1_13] "=r"(step1_13)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
+ [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
+ [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
+ [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
+
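+    // step1_8/9/14/15 need no rotation in this stage, so they are formed with
+    // plain additions; step1_10..13 were produced by the cospi_16_64
+    // half-butterflies in the asm block above.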
+ step1_8 = step2_8 + step2_11;
+ step1_9 = step2_9 + step2_10;
+ step1_14 = step2_13 + step2_14;
+ step1_15 = step2_12 + step2_15;
+
+ __asm__ __volatile__(
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step1_15] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step1_14] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[load5], %[step1_9] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step1_8] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step1_8] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step1_9] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step1_14] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step1_15] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
+ :
+ [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
+ [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
+ [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
+ [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
+ [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
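+
+    // The block above reconstructs one full column: each sum is rounded with
+    // (x + 32) >> 6, added to the destination pixel, clamped through the
+    // aom_ff_cropTbl lookup (cm), and stored, with dest_pix advancing by
+    // dest_stride per row.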
+
+ input += 16;
+ }
+}
+
+void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+ // First transform rows
+ idct16_rows_dspr2(input, out, 16);
+
+ // Then transform columns and add to dest
+ idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+  // First transform rows. Since all non-zero DCT coefficients are in the
+  // upper-left 4x4 area, only the first 4 rows need to be calculated here.
+ idct16_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
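+  // Only columns 4..15 of the 16x16 buffer are cleared here (the row pass
+  // stores its output transposed, leaving the computed coefficients in
+  // columns 0..3).  Each sw below zeroes two int16 values, the sixteen
+  // 32-byte-strided stores cover one column pair for all rows, and six
+  // iterations with outptr += 2 clear the rest.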
+ for (i = 0; i < 6; ++i) {
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 128(%[outptr]) \n\t"
+ "sw $zero, 160(%[outptr]) \n\t"
+ "sw $zero, 192(%[outptr]) \n\t"
+ "sw $zero, 224(%[outptr]) \n\t"
+ "sw $zero, 256(%[outptr]) \n\t"
+ "sw $zero, 288(%[outptr]) \n\t"
+ "sw $zero, 320(%[outptr]) \n\t"
+ "sw $zero, 352(%[outptr]) \n\t"
+ "sw $zero, 384(%[outptr]) \n\t"
+ "sw $zero, 416(%[outptr]) \n\t"
+ "sw $zero, 448(%[outptr]) \n\t"
+ "sw $zero, 480(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+
+ outptr += 2;
+ }
+
+ // Then transform columns
+ idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
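+  // a1 is the lone DC contribution, (out + 32) >> 6, applied to every pixel
+  // of the 16x16 block.  replv.qb replicates |a1| into all four bytes of a
+  // word so each lw/sw pair below updates four pixels at once;
+  // addu_s.qb / subu_s.qb provide unsigned saturation.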
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
+
+void iadst16_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+ int x0 = input[15];
+ int x1 = input[0];
+ int x2 = input[13];
+ int x3 = input[2];
+ int x4 = input[11];
+ int x5 = input[4];
+ int x6 = input[9];
+ int x7 = input[6];
+ int x8 = input[7];
+ int x9 = input[8];
+ int x10 = input[5];
+ int x11 = input[10];
+ int x12 = input[3];
+ int x13 = input[12];
+ int x14 = input[1];
+ int x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = output[8] = output[9] = output[10] =
+ output[11] = output[12] = output[13] = output[14] = output[15] = 0;
+ return;
+ }
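+  // The stages below mirror the scalar iadst16: dct_const_round_shift() is
+  // ROUND_POWER_OF_TWO(x, DCT_CONST_BITS), i.e. (x + 8192) >> 14 with the
+  // constants from aom_dsp/txfm_common.h.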
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = dct_const_round_shift(s0 + s8);
+ x1 = dct_const_round_shift(s1 + s9);
+ x2 = dct_const_round_shift(s2 + s10);
+ x3 = dct_const_round_shift(s3 + s11);
+ x4 = dct_const_round_shift(s4 + s12);
+ x5 = dct_const_round_shift(s5 + s13);
+ x6 = dct_const_round_shift(s6 + s14);
+ x7 = dct_const_round_shift(s7 + s15);
+ x8 = dct_const_round_shift(s0 - s8);
+ x9 = dct_const_round_shift(s1 - s9);
+ x10 = dct_const_round_shift(s2 - s10);
+ x11 = dct_const_round_shift(s3 - s11);
+ x12 = dct_const_round_shift(s4 - s12);
+ x13 = dct_const_round_shift(s5 - s13);
+ x14 = dct_const_round_shift(s6 - s14);
+ x15 = dct_const_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = s0 - s4;
+ x5 = s1 - s5;
+ x6 = s2 - s6;
+ x7 = s3 - s7;
+ x8 = dct_const_round_shift(s8 + s12);
+ x9 = dct_const_round_shift(s9 + s13);
+ x10 = dct_const_round_shift(s10 + s14);
+ x11 = dct_const_round_shift(s11 + s15);
+ x12 = dct_const_round_shift(s8 - s12);
+ x13 = dct_const_round_shift(s9 - s13);
+ x14 = dct_const_round_shift(s10 - s14);
+ x15 = dct_const_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = dct_const_round_shift(s4 + s6);
+ x5 = dct_const_round_shift(s5 + s7);
+ x6 = dct_const_round_shift(s4 - s6);
+ x7 = dct_const_round_shift(s5 - s7);
+ x8 = s8 + s10;
+ x9 = s9 + s11;
+ x10 = s8 - s10;
+ x11 = s9 - s11;
+ x12 = dct_const_round_shift(s12 + s14);
+ x13 = dct_const_round_shift(s13 + s15);
+ x14 = dct_const_round_shift(s12 - s14);
+ x15 = dct_const_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = dct_const_round_shift(s2);
+ x3 = dct_const_round_shift(s3);
+ x6 = dct_const_round_shift(s6);
+ x7 = dct_const_round_shift(s7);
+ x10 = dct_const_round_shift(s10);
+ x11 = dct_const_round_shift(s11);
+ x14 = dct_const_round_shift(s14);
+ x15 = dct_const_round_shift(s15);
+
+ output[0] = x0;
+ output[1] = -x8;
+ output[2] = x12;
+ output[3] = -x4;
+ output[4] = x6;
+ output[5] = x14;
+ output[6] = x10;
+ output[7] = x2;
+ output[8] = x3;
+ output[9] = x11;
+ output[10] = x15;
+ output[11] = x7;
+ output[12] = x5;
+ output[13] = -x13;
+ output[14] = x9;
+ output[15] = -x1;
+}
+
+#endif // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
new file mode 100644
index 000000000..d469d1ad0
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
@@ -0,0 +1,1042 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
+ int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
+ int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
+ int16_t step3_28, step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int i, temp21;
+ uint8_t *dest_pix, *dest_pix1;
+ const int const_2_power_13 = 8192;
+ uint8_t *cm = aom_ff_cropTbl;
+
+ /* prefetch aom_ff_cropTbl */
+ prefetch_load(aom_ff_cropTbl);
+ prefetch_load(aom_ff_cropTbl + 32);
+ prefetch_load(aom_ff_cropTbl + 64);
+ prefetch_load(aom_ff_cropTbl + 96);
+ prefetch_load(aom_ff_cropTbl + 128);
+ prefetch_load(aom_ff_cropTbl + 160);
+ prefetch_load(aom_ff_cropTbl + 192);
+ prefetch_load(aom_ff_cropTbl + 224);
+
+ for (i = 0; i < 32; ++i) {
+ dest_pix = dest + i;
+ dest_pix1 = dest + i + 31 * dest_stride;
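+    // dest_pix walks down column i from row 0 while dest_pix1 walks up the
+    // same column from row 31, so the store blocks below alternate between
+    // the top half of the output and its mirrored bottom half.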
+
+ __asm__ __volatile__(
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
+ [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
+ [step1_31] "=r"(step1_31)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
+ [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
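+    // A rough scalar sketch of the butterfly above, with byte offsets 2, 62,
+    // 34 and 30 corresponding to input[1], input[31], input[17] and
+    // input[15], and dct_const_round_shift() rounding by (x + 8192) >> 14:
+    //   temp0 = dct_const_round_shift(input[1] * cospi_31_64 -
+    //                                 input[31] * cospi_1_64);
+    //   temp3 = dct_const_round_shift(input[1] * cospi_1_64 +
+    //                                 input[31] * cospi_31_64);
+    //   temp1 = dct_const_round_shift(input[17] * cospi_15_64 -
+    //                                 input[15] * cospi_17_64);
+    //   temp2 = dct_const_round_shift(input[17] * cospi_17_64 +
+    //                                 input[15] * cospi_15_64);
+    //   step1_16 = temp0 + temp1;  step1_31 = temp2 + temp3;
+    //   step1_17 = dct_const_round_shift((temp3 - temp2) * cospi_28_64 -
+    //                                    (temp0 - temp1) * cospi_4_64);
+    //   step1_30 = dct_const_round_shift((temp3 - temp2) * cospi_4_64 +
+    //                                    (temp0 - temp1) * cospi_28_64);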
+
+ __asm__ __volatile__(
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
+ [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
+ [step1_29] "=r"(step1_29)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
+ [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
+ [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
+ [step1_27] "=r"(step1_27)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
+ [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
+ [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
+ [step1_25] "=r"(step1_25)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
+ [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
+ [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
+ [step2_15] "=r"(step2_15)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
+ [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
+ [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
+ [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
+ [step3_15] "=r"(step3_15)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
+ [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
+ [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
+ [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
+ [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r"(step3_18)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
+ [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
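+    // Only one output of each of these small rotations is computed in the DSP
+    // accumulator; its partner is computed in plain C with the explicit
+    // (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS, which matches the rounding
+    // performed by the extp extraction.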
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r"(step3_19)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
+ [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r"(step3_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r"(step3_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+ // stage 7
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
+ [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
+ [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
+ [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_2], %[step2_29] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_3], %[step2_28] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
+ [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
+ [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
+ [step2_31] "r"(step2_31));
+
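+    // step3_12..15 are reused as scratch for the mirrored bottom rows of this
+    // column (rows 31..28): the differences are rounded to pixel offsets here
+    // and added to the destination by the block below, with dest_pix1
+    // stepping upward by dest_stride.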
+ step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_6], %[step1_25] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_7], %[step1_24] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
+ [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
+ [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
+ [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
+ [step1_27] "r"(step1_27));
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_10], %[step1_21] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_11], %[step1_20] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
+ [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
+ [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
+ [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
+ [step1_23] "r"(step1_23));
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_14], %[step2_17] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_15], %[step2_16] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
+ [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
+ [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
+ [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
+ [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
+ [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+
+ input += 32;
+ }
+}
+#endif // #if HAVE_DSPR2
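
The column-add blocks above all repeat one reconstruct step: scale the inverse-transform result down by 6 bits with rounding (ROUND_POWER_OF_TWO, or the equivalent addi 32 / sra 6 pair), add it to the destination byte loaded with lbu, clamp through the aom_ff_cropTbl lookup (lbux), store the byte, and advance by the stride. A minimal scalar sketch of that step, assuming a clip_byte helper in place of the crop-table lookup (both names below are illustrative, not part of the patch):

/* Scalar model of the DSPr2 add-and-clamp pattern above (illustrative only). */
static unsigned char clip_byte(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void add_residual_column(const int *residual, unsigned char *dest,
                                int dest_stride, int rows) {
  int r;
  for (r = 0; r < rows; ++r) {
    /* ROUND_POWER_OF_TWO(x, 6) == (x + 32) >> 6: the final IDCT down-scale. */
    const int rounded = (residual[r] + 32) >> 6;
    dest[0] = clip_byte(dest[0] + rounded);
    dest += dest_stride;
  }
}

In the assembly the same step is unrolled two pixels per block, with dest_pix walking forward through the column (addu) and dest_pix1 walking backward (subu).
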
diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
new file mode 100644
index 000000000..fa7703217
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
@@ -0,0 +1,1030 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int16_t step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int16_t step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int temp21;
+ int i;
+ const int const_2_power_13 = 8192;
+ const int32_t *input_int;
+
+ for (i = no_rows; i--;) {
+ input_int = (const int32_t *)input;
+
+ if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
+ input_int[4] | input_int[5] | input_int[6] | input_int[7] |
+ input_int[8] | input_int[9] | input_int[10] | input_int[11] |
+ input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
+ input += 32;
+
+ __asm__ __volatile__(
+ "sh $zero, 0(%[output]) \n\t"
+ "sh $zero, 64(%[output]) \n\t"
+ "sh $zero, 128(%[output]) \n\t"
+ "sh $zero, 192(%[output]) \n\t"
+ "sh $zero, 256(%[output]) \n\t"
+ "sh $zero, 320(%[output]) \n\t"
+ "sh $zero, 384(%[output]) \n\t"
+ "sh $zero, 448(%[output]) \n\t"
+ "sh $zero, 512(%[output]) \n\t"
+ "sh $zero, 576(%[output]) \n\t"
+ "sh $zero, 640(%[output]) \n\t"
+ "sh $zero, 704(%[output]) \n\t"
+ "sh $zero, 768(%[output]) \n\t"
+ "sh $zero, 832(%[output]) \n\t"
+ "sh $zero, 896(%[output]) \n\t"
+ "sh $zero, 960(%[output]) \n\t"
+ "sh $zero, 1024(%[output]) \n\t"
+ "sh $zero, 1088(%[output]) \n\t"
+ "sh $zero, 1152(%[output]) \n\t"
+ "sh $zero, 1216(%[output]) \n\t"
+ "sh $zero, 1280(%[output]) \n\t"
+ "sh $zero, 1344(%[output]) \n\t"
+ "sh $zero, 1408(%[output]) \n\t"
+ "sh $zero, 1472(%[output]) \n\t"
+ "sh $zero, 1536(%[output]) \n\t"
+ "sh $zero, 1600(%[output]) \n\t"
+ "sh $zero, 1664(%[output]) \n\t"
+ "sh $zero, 1728(%[output]) \n\t"
+ "sh $zero, 1792(%[output]) \n\t"
+ "sh $zero, 1856(%[output]) \n\t"
+ "sh $zero, 1920(%[output]) \n\t"
+ "sh $zero, 1984(%[output]) \n\t"
+
+ :
+ : [output] "r"(output));
+
+ output += 1;
+
+ continue;
+ }
+
+ /* prefetch row */
+ prefetch_load((const uint8_t *)(input + 32));
+ prefetch_load((const uint8_t *)(input + 48));
+
+ __asm__ __volatile__(
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
+ [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
+ [step1_31] "=r"(step1_31)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
+ [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
+ [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
+ [step1_29] "=r"(step1_29)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
+ [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
+ [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
+ [step1_27] "=r"(step1_27)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
+ [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
+ [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
+ [step1_25] "=r"(step1_25)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
+ [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
+ [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
+ [step2_15] "=r"(step2_15)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
+ [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
+ [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
+ [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
+ [step3_15] "=r"(step3_15)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
+ [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
+ [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
+ [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
+ [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r"(step3_18)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
+ [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r"(step3_19)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
+ [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r"(step3_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r"(step3_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64)
+
+ );
+
+ __asm__ __volatile__(
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
+ [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
+ [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
+ [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ // final stage
+ output[0 * 32] = step1_0 + step2_31;
+ output[1 * 32] = step1_1 + step2_30;
+ output[2 * 32] = step1_2 + step2_29;
+ output[3 * 32] = step1_3 + step2_28;
+ output[4 * 32] = step1_4 + step1_27;
+ output[5 * 32] = step1_5 + step1_26;
+ output[6 * 32] = step1_6 + step1_25;
+ output[7 * 32] = step1_7 + step1_24;
+ output[8 * 32] = step1_8 + step1_23;
+ output[9 * 32] = step1_9 + step1_22;
+ output[10 * 32] = step1_10 + step1_21;
+ output[11 * 32] = step1_11 + step1_20;
+ output[12 * 32] = step1_12 + step2_19;
+ output[13 * 32] = step1_13 + step2_18;
+ output[14 * 32] = step1_14 + step2_17;
+ output[15 * 32] = step1_15 + step2_16;
+ output[16 * 32] = step1_15 - step2_16;
+ output[17 * 32] = step1_14 - step2_17;
+ output[18 * 32] = step1_13 - step2_18;
+ output[19 * 32] = step1_12 - step2_19;
+ output[20 * 32] = step1_11 - step1_20;
+ output[21 * 32] = step1_10 - step1_21;
+ output[22 * 32] = step1_9 - step1_22;
+ output[23 * 32] = step1_8 - step1_23;
+ output[24 * 32] = step1_7 - step1_24;
+ output[25 * 32] = step1_6 - step1_25;
+ output[26 * 32] = step1_5 - step1_26;
+ output[27 * 32] = step1_4 - step1_27;
+ output[28 * 32] = step1_3 - step2_28;
+ output[29 * 32] = step1_2 - step2_29;
+ output[30 * 32] = step1_1 - step2_30;
+ output[31 * 32] = step1_0 - step2_31;
+
+ input += 32;
+ output += 1;
+ }
+}
+
+void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ // Rows
+ idct32_rows_dspr2(input, outptr, 32);
+
+ // Columns
+ aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ // Rows
+ idct32_rows_dspr2(input, outptr, 8);
+
+ outptr += 8;
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+
+ for (i = 0; i < 31; ++i) {
+ outptr += 32;
+
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+ }
+
+ // Columns
+ aom_idct32_cols_add_blk_dspr2(out, dest, stride);
+}
+
+void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
+
+ if (a1 < 0) {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
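
The madd/msub/extp sequences in idct32_rows_dspr2 compute exactly what the interleaved C fallback lines spell out (for example temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64 followed by (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS): mtlo seeds the accumulator with the rounding bias 8192, madd/msub accumulate the two products, and extp ..., 31 with the wrdsp position of 45 extracts the accumulator shifted right by DCT_CONST_BITS (14). A hedged C sketch of one such butterfly output; the helper name is illustrative, not part of the patch:

#include <stdint.h>

/* C model of one accumulator output of idct32_rows_dspr2: the rounding bias is
 * folded into the sum here instead of being preloaded with mtlo.  Pass a
 * negative cos_b to model the msub form.  Illustrative, not part of the patch. */
static int16_t butterfly_round_shift(int16_t a, int16_t b, int cos_a, int cos_b) {
  const int32_t sum = a * cos_a + b * cos_b; /* madd / msub accumulation */
  return (int16_t)((sum + 8192) >> 14);      /* DCT_CONST_ROUNDING, DCT_CONST_BITS */
}

The gain of the assembly over this scalar form comes from running two such butterflies at once on independent accumulators ($ac0-$ac3).
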
diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
new file mode 100644
index 000000000..e6d0367cd
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+
+ for (i = 4; i--;) {
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+
+ "add %[Temp1], %[step_1], %[step_2] \n\t"
+ "sh %[Temp1], 8(%[output]) \n\t"
+
+ "sub %[Temp2], %[step_1], %[step_2] \n\t"
+ "sh %[Temp2], 16(%[output]) \n\t"
+
+ "sub %[Temp3], %[step_0], %[step_3] \n\t"
+ "sh %[Temp3], 24(%[output]) \n\t"
+
+ : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
+ [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
+
+ input += 4;
+ output += 1;
+ }
+}
+
+void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+ uint8_t *dest_pix;
+ uint8_t *cm = aom_ff_cropTbl;
+
+ /* prefetch aom_ff_cropTbl */
+ prefetch_load(aom_ff_cropTbl);
+ prefetch_load(aom_ff_cropTbl + 32);
+ prefetch_load(aom_ff_cropTbl + 64);
+ prefetch_load(aom_ff_cropTbl + 96);
+ prefetch_load(aom_ff_cropTbl + 128);
+ prefetch_load(aom_ff_cropTbl + 160);
+ prefetch_load(aom_ff_cropTbl + 192);
+ prefetch_load(aom_ff_cropTbl + 224);
+
+ for (i = 0; i < 4; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_0], %[step_3] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
+ [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
+ [dest_pix] "+r"(dest_pix)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
+ [dest_stride] "r"(dest_stride));
+
+ input += 4;
+ }
+}
+
+void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ // Rows
+ aom_idct4_rows_dspr2(input, outptr);
+
+ // Columns
+ aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int a1, absa1;
+ int r;
+ int32_t out;
+ int t2, vector_a1, vector_a;
+ uint32_t pos = 45;
+ int16_t input_dc = input[0];
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 8 \n\t"
+ "sra %[a1], %[out], 4 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
+
+ if (a1 < 0) {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__(
+ "lw %[t2], 0(%[dest]) \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ }
+ } else {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__(
+ "lw %[t2], 0(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
+
+void iadst4_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = dct_const_round_shift(s0);
+ output[1] = dct_const_round_shift(s1);
+ output[2] = dct_const_round_shift(s2);
+ output[3] = dct_const_round_shift(s3);
+}
+#endif // #if HAVE_DSPR2
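
aom_idct4x4_1_add_dspr2 above covers the DC-only block: the single coefficient goes twice through dct_const_round_shift(x * cospi_16_64) via DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64, is rounded by 4 bits (addi 8 / sra 4), broadcast into all four bytes of a word with replv.qb, and then added to four pixels at a time with the saturating addu_s.qb / subu_s.qb instructions. A scalar sketch of the same computation, assuming cospi_16_64 = 11585 from txfm_common.h; the function name is illustrative, not part of the patch:

#include <stdint.h>

/* Scalar model of the DC-only 4x4 reconstruction vectorised above with
 * replv.qb + addu_s.qb/subu_s.qb.  Illustrative only. */
static void idct4x4_dc_add_sketch(int16_t dc, uint8_t *dest, int dest_stride) {
  int out, a1, v, r, c;

  /* Two dct_const_round_shift(x * cospi_16_64) passes, then round by 4 bits. */
  out = (dc * 11585 + 8192) >> 14;
  out = (out * 11585 + 8192) >> 14;
  a1 = (out + 8) >> 4;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      v = dest[c] + a1;
      dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* saturating add */
    }
    dest += dest_stride;
  }
}

The assembly splits into an add branch and a subtract-of-|a1| branch only because the saturating quad-byte instructions operate on unsigned bytes; the clamp in the sketch covers both cases.
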
diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
new file mode 100644
index 000000000..0a20f76f2
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
@@ -0,0 +1,645 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ const int const_2_power_13 = 8192;
+ int Temp0, Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ for (i = no_rows; i--;) {
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[Temp4], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[Temp4], %[Temp1] \n\t"
+ "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+ "add %[Temp1], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp1], 16(%[output]) \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp0], 32(%[output]) \n\t"
+ "add %[Temp1], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp1], 48(%[output]) \n\t"
+
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp0], 64(%[output]) \n\t"
+ "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp1], 80(%[output]) \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp0], 96(%[output]) \n\t"
+ "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp1], 112(%[output]) \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
+ [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
+ [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
+ [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
+ [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
+ [input] "r"(input));
+
+ input += 8;
+ output += 1;
+ }
+}
+
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int Temp0, Temp1, Temp2, Temp3;
+ int i;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = aom_ff_cropTbl;
+
+ /* prefetch aom_ff_cropTbl */
+ prefetch_load(aom_ff_cropTbl);
+ prefetch_load(aom_ff_cropTbl + 32);
+ prefetch_load(aom_ff_cropTbl + 64);
+ prefetch_load(aom_ff_cropTbl + 96);
+ prefetch_load(aom_ff_cropTbl + 128);
+ prefetch_load(aom_ff_cropTbl + 160);
+ prefetch_load(aom_ff_cropTbl + 192);
+ prefetch_load(aom_ff_cropTbl + 224);
+
+ for (i = 0; i < 8; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[step1_6], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[step1_6], %[Temp1] \n\t"
+ "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /* add block */
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
+ [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
+ [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
+ [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
+ [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
+ [dest_stride] "r"(dest_stride));
+
+ input += 8;
+ }
+}
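+
+/* Editorial sketch (not part of the upstream import): for each output pixel
+   the add-block sequence above computes, in effect,
+     dest[x] = clip_to_u8(dest[x] + ((column_result + 16) >> 5));
+   the lbux lookup through aom_ff_cropTbl is simply a table-based byte clamp.
+   A plain-C equivalent of that final step: */
+static uint8_t add_and_clip_sketch(uint8_t pixel, int column_result) {
+  const int v = pixel + ((column_result + 16) >> 5);  /* round, shift by 5 */
+  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* clamp to [0, 255] */
+}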
+
+void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extraction from the accumulator */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+ // First transform rows
+ idct8_rows_dspr2(input, outptr, 8);
+
+ // Then transform columns and add to dest
+ idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extraction from the accumulator */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+ // First transform rows
+ idct8_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
+
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 48(%[outptr]) \n\t"
+ "sw $zero, 52(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 68(%[outptr]) \n\t"
+ "sw $zero, 80(%[outptr]) \n\t"
+ "sw $zero, 84(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 100(%[outptr]) \n\t"
+ "sw $zero, 112(%[outptr]) \n\t"
+ "sw $zero, 116(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+
+ // Then transform columns and add to dest
+ idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
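+
+/* Editorial sketch (not part of the upstream import): with at most 12
+   non-zero coefficients only the first four input rows are transformed, and
+   their results land in columns 0..3 of the transposed intermediate block,
+   so the store sequence above clears columns 4..7 of every row (rows are
+   8 int16_t, i.e. 16 bytes, apart).  A plain-C equivalent: */
+static void zero_right_half_8x8_sketch(int16_t *out /* 8x8, row-major */) {
+  int r, c;
+  for (r = 0; r < 8; ++r)
+    for (c = 4; c < 8; ++c) out[r * 8 + c] = 0;
+}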
+
+void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extraction from the accumulator */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 16 \n\t"
+ "sra %[a1], %[out], 5 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
+ : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
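+
+/* Editorial sketch (not part of the upstream import): the DC-only path
+   derives one correction a1 = (transformed_dc + 16) >> 5 and applies it,
+   with unsigned saturation, to all 64 pixels; the replv.qb plus
+   addu_s.qb / subu_s.qb sequences above just do this four bytes at a time.
+   A scalar equivalent, assuming an ordinary [0, 255] clamp: */
+static void idct8x8_dc_add_sketch(int a1, uint8_t *dest, int dest_stride) {
+  int r, c;
+  for (r = 0; r < 8; ++r) {
+    for (c = 0; c < 8; ++c) {
+      const int v = dest[c] + a1;  /* a1 may be negative */
+      dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+    dest += dest_stride;
+  }
+}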
+
+void iadst8_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3, x4, x5, x6, x7;
+
+ x0 = input[7];
+ x1 = input[0];
+ x2 = input[5];
+ x3 = input[2];
+ x4 = input[3];
+ x5 = input[4];
+ x6 = input[1];
+ x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+ x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+ x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+ x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+ output[0] = x0;
+ output[1] = -x4;
+ output[2] = x6;
+ output[3] = -x2;
+ output[4] = x3;
+ output[5] = -x7;
+ output[6] = x5;
+ output[7] = -x1;
+}
+#endif // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
new file mode 100644
index 000000000..fc0c32ce3
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
@@ -0,0 +1,1487 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/mips/loopfilter_msa.h"
+
+int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
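+
+/* Editorial note (layout inferred from the stores above and the loads in
+   aom_hz_lpf_t16_16w below; the enum names are illustrative, not upstream
+   identifiers): the 128-byte filter48 scratch buffer carries the six
+   filter8 result vectors followed by the flat mask, and the return value is
+   1 when the whole edge took the 4-tap path so the 16-tap stage can be
+   skipped. */
+enum {
+  FILTER48_P2 = 0,
+  FILTER48_P1 = 16,
+  FILTER48_P0 = 32,
+  FILTER48_Q0 = 48,
+  FILTER48_Q1 = 64,
+  FILTER48_Q2 = 80,
+  FILTER48_FLAT = 96
+};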
+
+void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+ v16u8 flat, flat2, filter8;
+ v16i8 zero = { 0 };
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+ v8i16 l_out, r_out;
+
+ flat = LD_UB(filter48 + 96);
+
+ LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ src -= 3 * pitch;
+ ST_UB4(p2, p1, p0, q0, src, pitch);
+ src += (4 * pitch);
+ ST_UB2(q1, q2, src, pitch);
+ } else {
+ src -= 7 * pitch;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+ p5_l_in, p4_l_in);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+ p1_l_in, p0_l_in);
+ q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST_UB(p6, src);
+ src += pitch;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST_UB(p5, src);
+ src += pitch;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
+
+ q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST_UB(p4, src);
+ src += pitch;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST_UB(p3, src);
+ src += pitch;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST_UB(q3, src);
+ src += pitch;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST_UB(q4, src);
+ src += pitch;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST_UB(q5, src);
+ src += pitch;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST_UB(q6, src);
+ }
+}
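+
+/* Editorial note (sketch of the arithmetic above, not upstream text): each
+   flat2 output is a rounded 16-sample average, and the code keeps a running
+   sum that changes by one add and one subtract per output.  For the first
+   two taps:
+     p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
+     p5' = (6*p7 + p6 + 2*p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4
+   so between the two outputs the sum moves by (p5 - p6) + (q1 - p7), which
+   is exactly the tmp0_r / tmp0_l update used in the "p5" block. */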
+
+void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr, int32_t count) {
+ DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
+ uint8_t early_exit = 0;
+
+ (void)count;
+
+ early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+ limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ aom_hz_lpf_t16_16w(src, pitch, filter48);
+ }
+}
+
+static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr, int32_t count) {
+ if (1 == count) {
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ uint64_t dword0, dword1;
+ v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 p0_filter16, p1_filter16;
+ v8i16 p2_filter8, p1_filter8, p0_filter8;
+ v8i16 q0_filter8, q1_filter8, q2_filter8;
+ v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+ v16i8 zero = { 0 };
+ v8u16 tmp0, tmp1, tmp2;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+ q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+ } else {
+ /* convert 8 bit input data into 16 bit */
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+ zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+ q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
+ PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+ /* load 16 vector elements */
+ LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+ LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+ SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+ SD(q1_d, src + pitch);
+ SD(q2_d, src + 2 * pitch);
+ } else {
+        /* LSB (right) 8-pixel operation */
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
+ zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
+ q7_r);
+
+ tmp0 = p7_r << 3;
+ tmp0 -= p7_r;
+ tmp0 += p6_r;
+ tmp0 += q0_r;
+
+ src -= 7 * pitch;
+
+ /* calculation of p6 and p5 */
+ tmp1 = p6_r + p5_r + p4_r + p3_r;
+ tmp1 += (p2_r + p1_r + p0_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp0 = p5_r - p6_r + q1_r - p7_r;
+ tmp1 += tmp0;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p4 and p3 */
+ tmp0 = p4_r - p5_r + q2_r - p7_r;
+ tmp2 = p3_r - p4_r + q3_r - p7_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p2 and p1 */
+ tmp0 = p2_r - p3_r + q4_r - p7_r;
+ tmp2 = p1_r - p2_r + q5_r - p7_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p0 and q0 */
+ tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+ tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q1 and q2 */
+ tmp0 = q7_r - q0_r + q1_r - p6_r;
+ tmp2 = q7_r - q1_r + q2_r - p5_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q3 and q4 */
+ tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+ tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q5 and q6 */
+ tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+ tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ }
+ }
+ } else {
+ aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
+ thresh_ptr, count);
+ }
+}
+
+void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
+}
+
+void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+ uint8_t *output, int32_t out_pitch) {
+ v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+ v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+ LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
+ p1_org, p0_org);
+ /* 8x8 transpose */
+ TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+ p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+ /* 8x8 transpose */
+ ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+ ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+ ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+ ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+ SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+ ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+ output += (8 * out_pitch);
+ ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
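+
+/* Editorial sketch (not part of the upstream import): this helper and the
+   transpose_8x16_to_16x8() / transpose_16x16() helpers below are MSA
+   shuffle implementations of an ordinary byte-tile transpose.  A scalar
+   reference, handy for validating the interleave sequences: */
+static void transpose_tile_sketch(const uint8_t *in, int in_pitch, int in_w,
+                                  int in_h, uint8_t *out, int out_pitch) {
+  /* out must provide in_w rows of at least in_h bytes each */
+  int r, c;
+  for (r = 0; r < in_h; ++r)
+    for (c = 0; c < in_w; ++c) out[c * out_pitch + r] = in[r * in_pitch + c];
+}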
+
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+ uint8_t *output, int32_t out_pitch) {
+ v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+ LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+ TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+ q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+ ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
+ int32_t out_pitch) {
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+ v4i32 tmp2, tmp3;
+
+ LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ input += (8 * in_pitch);
+ LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
+
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p7, p6,
+ p5, p4, p3, p2, p1, p0);
+
+ /* transpose 16x8 matrix into 8x16 */
+  /* total 8 intermediate registers and 32 instructions */
+ q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
+ q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
+ q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
+ q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
+ q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
+ q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
+ q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
+ q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
+
+ ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+ tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
+ tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
+
+ ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+ tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
+ tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
+
+ ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+ q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
+ tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
+ q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+ q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
+ tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
+ q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+ output += (8 * out_pitch);
+ ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch_org,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v16i8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3;
+
+ /* load vector elements */
+ LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ /* convert 16 bit output data into 8 bit */
+ p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
+ p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
+ p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
+ q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
+ q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
+ q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
+
+int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
+ v16i8 zero = { 0 };
+ v16u8 filter8, flat, flat2;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 tmp0_r, tmp1_r;
+ v8i16 r_out;
+
+ flat = LD_UB(filter48 + 6 * 16);
+
+ LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ v8i16 vec0, vec1, vec2, vec3, vec4;
+
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+ src_org -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+ return 1;
+ } else {
+ src -= 7 * 16;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST8x1_UB(p6, src);
+ src += 16;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST8x1_UB(p5, src);
+ src += 16;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST8x1_UB(p4, src);
+ src += 16;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST8x1_UB(p3, src);
+ src += 16;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST8x1_UB(q3, src);
+ src += 16;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST8x1_UB(q4, src);
+ src += 16;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST8x1_UB(q5, src);
+ src += 16;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST8x1_UB(q6, src);
+
+ return 0;
+ }
+}
+
+void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint8_t early_exit = 0;
+ DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+ uint8_t *filter48 = &transposed_input[16 * 16];
+
+ transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+
+ early_exit =
+ aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+ &filter48[0]);
+
+ if (0 == early_exit) {
+ transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+ }
+ }
+}
+
+int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16i8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+
+ /* load vector elements */
+ LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+ src_org -= 2;
+ ST4x8_UB(vec2, vec3, src_org, pitch);
+ src_org += 8 * pitch;
+ ST4x8_UB(vec4, vec5, src_org, pitch);
+
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
+
+int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
+ v16u8 flat, flat2, filter8;
+ v16i8 zero = { 0 };
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+ v8i16 l_out, r_out;
+
+ flat = LD_UB(filter48 + 6 * 16);
+
+ LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+ ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+ src_org -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+ return 1;
+ } else {
+ src -= 7 * 16;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+ p5_l_in, p4_l_in);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+ p1_l_in, p0_l_in);
+ q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST_UB(p6, src);
+ src += 16;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST_UB(p5, src);
+ src += 16;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST_UB(p4, src);
+ src += 16;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST_UB(p3, src);
+ src += 16;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST_UB(q3, src);
+ src += 16;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST_UB(q4, src);
+ src += 16;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST_UB(q5, src);
+ src += 16;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST_UB(q6, src);
+
+ return 0;
+ }
+}
+
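+/* Vertical 16-wide dual filter: the 16x16 block around the edge is
+   transposed into a scratch buffer so the edge runs horizontally, the
+   4/8-tap and then the 16-tap stages are applied to the transposed data,
+   and the block is transposed back only if neither stage reports an early
+   exit. */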
+void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint8_t early_exit = 0;
+ DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+ uint8_t *filter48 = &transposed_input[16 * 16];
+
+ transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+ early_exit =
+ aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+ &filter48[0]);
+
+ if (0 == early_exit) {
+ transpose_16x16(transposed_input, 16, (src - 8), pitch);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
new file mode 100644
index 000000000..dc0a97764
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/loopfilter_msa.h"
+
+void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint64_t p1_d, p0_d, q0_d, q1_d;
+ v16u8 mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+}
+
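+/* Dual variant: two adjacent 8-pixel edges are filtered in one pass by
+   replicating each threshold/limit byte and packing the two sets into the
+   low and high halves of one vector with __msa_ilvr_d. */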
+void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+ ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
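+/* Vertical edge: the 8x8 block around the edge is transposed so the edge
+   runs horizontally, the 4-tap filter is applied, and the filtered columns
+   are re-interleaved and written back as two groups of 4x4 stores. */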
+void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 mask, hev, flat, limit, thresh, b_limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v8i16 vec0, vec1, vec2, vec3;
+
+ LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+ src -= 2;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ src += 4 * pitch;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+}
+
+void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat;
+ v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
+ row14, row15);
+
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+ src -= 2;
+
+ ST4x8_UB(tmp2, tmp3, src, pitch);
+ src += (8 * pitch);
+ ST4x8_UB(tmp4, tmp5, src, pitch);
+}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
new file mode 100644
index 000000000..dc203e79c
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/loopfilter_msa.h"
+
+void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ v16u8 mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+ v16i8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
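+  /* only the lower 8 pixels are filtered here, so clear the upper half of
+     the flat mask before testing it */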
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
+ PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+ p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+ src -= 3 * pitch;
+
+ SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+ src += (4 * pitch);
+ SD(q1_d, src);
+ src += pitch;
+ SD(q2_d, src);
+ }
+}
+
+void aom_lpf_horizontal_8_dual_msa(
+ uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh0);
+ tmp = (v16u8)__msa_fill_b(*thresh1);
+ thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit0);
+ tmp = (v16u8)__msa_fill_b(*b_limit1);
+ b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
+
+ limit = (v16u8)__msa_fill_b(*limit0);
+ tmp = (v16u8)__msa_fill_b(*limit1);
+ limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ src -= 3 * pitch;
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+ src += (4 * pitch);
+ ST_UB2(q1_out, q2_out, src, pitch);
+ src += (2 * pitch);
+ }
+}
+
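+/* Vertical 8-tap case: transpose the 8x8 block so the edge is horizontal,
+   filter, then write back either 4 columns (flat == 0) or 6 columns using
+   ST4x4_UB/ST2x4_UB stores. */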
+void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p1_out, p0_out, q0_out, q1_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v16u8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4;
+
+ /* load vector elements */
+ LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+    /* Store 4 pixels p1 - q1 */
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+ src -= 2;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ src += 4 * pitch;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+ p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    /* Store 6 pixels p2 - q2 */
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+ src -= 3;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec4, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec4, 4, src + 4, pitch);
+ }
+}
+
+void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8_t *temp_src;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p1_out, p0_out, q0_out, q1_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ temp_src = src - 4;
+
+ LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+ temp_src += (8 * pitch);
+ LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+ /* transpose 16x8 matrix into 8x16 */
+ TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+ row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh0);
+ vec0 = (v8i16)__msa_fill_b(*thresh1);
+ thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit0);
+ vec0 = (v8i16)__msa_fill_b(*b_limit1);
+ b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
+
+ limit = (v16u8)__msa_fill_b(*limit0);
+ vec0 = (v8i16)__msa_fill_b(*limit1);
+ limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+ src -= 2;
+ ST4x8_UB(vec2, vec3, src, pitch);
+ src += 8 * pitch;
+ ST4x8_UB(vec4, vec5, src, pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+
+ /* filter8 */
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+ ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+ src -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec2, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec2, 4, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec5, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec5, 4, src + 4, pitch);
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
new file mode 100644
index 000000000..883d0523d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+  /* loop filter designed to work on chars so that we can make maximum use
+     of 8-bit SIMD instructions. */
+ for (i = 0; i < 2; i++) {
+ sm1 = s - (pitch << 2);
+ s0 = sm1 + pitch;
+ s1 = s0 + pitch;
+ s2 = s - pitch;
+ s3 = s;
+ s4 = s + pitch;
+ s5 = s4 + pitch;
+ s6 = s5 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p1], (%[s1]) \n\t"
+ "lw %[p2], (%[s2]) \n\t"
+ "lw %[p3], (%[s3]) \n\t"
+ "lw %[p4], (%[s4]) \n\t"
+
+ : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ mask will be zero and filtering is not needed */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ __asm__ __volatile__(
+ "lw %[pm1], (%[sm1]) \n\t"
+ "lw %[p0], (%[s0]) \n\t"
+ "lw %[p5], (%[s5]) \n\t"
+ "lw %[p6], (%[s6]) \n\t"
+
+ : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
+ : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
+
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ __asm__ __volatile__(
+ "sw %[p1], (%[s1]) \n\t"
+ "sw %[p2], (%[s2]) \n\t"
+ "sw %[p3], (%[s3]) \n\t"
+ "sw %[p4], (%[s4]) \n\t"
+
+ :
+ : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
+ [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
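+/* Vertical DSPr2 filter: each iteration handles four rows.  Eight 32-bit
+   loads cover an 8x4 block around the edge, the precrq/precr.qb.ph pairs
+   transpose rows into packed columns, the packed 4-tap filter runs, and the
+   results are written back byte by byte because the destination is not
+   word aligned. */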
+void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+
+ : [p1] "+r"(p1)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s2] "r"(s2));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s1] "r"(s1));
+ }
+ }
+ }
+}
+
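+/* The dual (and 16-wide dual) wrappers simply run the single-edge DSPr2
+   filter twice: once on the first 8 pixels (or rows) and once on the next
+   8. */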
+void aom_lpf_horizontal_4_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_horizontal_8_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
+ aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
new file mode 100644
index 000000000..72df09823
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
@@ -0,0 +1,735 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#define AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
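+/* These helpers keep four pixels packed in each uint32_t.  To get per-pixel
+   saturation, each quad-byte is split into two halfword pairs (the _l values
+   keep the high bytes, the _r values shift the low bytes up), processed with
+   saturating .ph instructions, and recombined afterwards. */
+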
+/* inputs & outputs are quad-byte vectors */
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
+ uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
+ int32_t aom_filter_l, aom_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes to preserve accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__(
+ /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* aom_filter &= hev; */
+ "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+ /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+ /* aom_filter &= mask; */
+ "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+ : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
+
+ __asm__ __volatile__(
+ /* (aom_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* aom_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__(
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
+
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
+ uint32_t ps0, uint32_t qs0, uint32_t qs1,
+ uint32_t *p1_f0, uint32_t *p0_f0,
+ uint32_t *q0_f0, uint32_t *q1_f0) {
+ int32_t aom_filter_l, aom_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (ps0) ^ N128;
+ vps1 = (ps1) ^ N128;
+ vqs0 = (qs0) ^ N128;
+ vqs1 = (qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes to preserve accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__(
+ /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* aom_filter &= hev; */
+ "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+ /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+ /* aom_filter &= mask; */
+ "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+ : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
+
+ __asm__ __volatile__(
+ /* (aom_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* aom_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__(
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *p0_f0 = vps0 ^ N128;
+ *p1_f0 = vps1 ^ N128;
+ *q0_f0 = vqs0 ^ N128;
+ *q1_f0 = vqs1 ^ N128;
+}
+
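+/* add_p210_q012 accumulates p2 + p1 + p0 + q0 + q1 + q2 + 4 once; each
+   output tap then adds its remaining samples and subtracts the ones outside
+   its window before the shift by 3. */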
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
+ uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__(
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
+
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+}
+
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3, uint32_t *op2_f1,
+ uint32_t *op1_f1, uint32_t *op0_f1,
+ uint32_t *oq0_f1, uint32_t *oq1_f1,
+ uint32_t *oq2_f1) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__(
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
+
+ *op2_f1 = res_op2;
+ *op1_f1 = res_op1;
+ *op0_f1 = res_op0;
+ *oq0_f1 = res_oq0;
+ *oq1_f1 = res_oq1;
+ *oq2_f1 = res_oq2;
+}
+
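+/* add_p6toq6 holds p6 + ... + q6 + 8 and is shared by every output tap;
+   each tap adds its weighted p7/q7 terms plus its doubled centre sample and
+   subtracts the samples outside its 15-tap window before the shift by 4. */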
+static INLINE void wide_mbfilter_dspr2(
+ uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
+ uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
+ uint32_t *oq7) {
+ const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+ uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+ uint32_t tmp;
+ uint32_t add_p6toq6;
+ uint32_t u32Eight = 0x00080008;
+
+ __asm__ __volatile__(
+      /* sum of p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6,
+         which is reused by most of the output taps */
+ "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
+
+ : [add_p6toq6] "=&r"(add_p6toq6)
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
+ [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [u32Eight] "r"(u32Eight));
+
+ __asm__ __volatile__(
+ /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
+ p3 + p2 + p1 + p0 + q0, 4) */
+ "shll.ph %[tmp], %[p7], 3 \n\t"
+ "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
+ "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
+
+ /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
+ p2 + p1 + p0 + q0 + q1, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
+ "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
+
+ /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
+ p1 + p0 + q0 + q1 + q2, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
+ "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
+
+ /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1 + q2 + q3, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
+ "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
+
+ /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
+ p0 + q0 + q1 + q2 + q3 + q4, 4) */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
+
+ /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+ p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
+
+ /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
+ "addu.ph %[res_op0], %[p7], %[p0] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
+
+ : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
+ [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [add_p6toq6] "r"(add_p6toq6));
+
+ *op6 = res_op6;
+ *op5 = res_op5;
+ *op4 = res_op4;
+ *op3 = res_op3;
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+
+ __asm__ __volatile__(
+ /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+ q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
+ "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"
+
+ /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+ q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"
+
+ /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+ q3 + q4 + q5 + q6 + q7 * 3, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"
+
+ /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
+ q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
+ "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"
+
+ /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
+ q4 * 2 + q5 + q6 + q7 * 5, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
+ "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"
+
+ /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
+ q5 * 2 + q6 + q7 * 6, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
+ "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"
+
+ /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
+ q4 + q5 + q6 * 2 + q7 * 7, 4) */
+ "shll.ph %[tmp], %[q7], 3 \n\t"
+ "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
+ "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
+
+ : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
+ [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
+ [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
+ [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
+ : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
+ [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
+ [add_p6toq6] "r"(add_p6toq6));
+
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+ *oq3 = res_oq3;
+ *oq4 = res_oq4;
+ *oq5 = res_oq5;
+ *oq6 = res_oq6;
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
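For reference, the ROUND_POWER_OF_TWO expressions quoted in the comments above reduce to a sliding 16-tap average in which the pixel being replaced is counted twice and the outermost samples p7/q7 are replicated outward; the DSPR2 code reaches the same values by keeping the packed running sum add_p6toq6 and adding or subtracting individual taps. A minimal scalar sketch of that model (illustrative only; the helper name and array layout below are not part of the imported sources):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    /* pix[0..15] holds one column p7,p6,...,p0,q0,...,q7;
       out[0..13] receives op6..op0,oq0..oq6. */
    static void wide_filter_sketch(const uint8_t pix[16], uint8_t out[14]) {
      int ext[30]; /* p7 replicated 7x, the 16 samples, q7 replicated 7x */
      int i, j;
      for (i = 0; i < 7; ++i) ext[i] = pix[0];
      for (i = 0; i < 16; ++i) ext[7 + i] = pix[i];
      for (i = 0; i < 7; ++i) ext[23 + i] = pix[15];
      for (j = 1; j <= 14; ++j) { /* one output per pixel p6..q6 */
        int sum = pix[j];         /* the replaced pixel is counted twice */
        for (i = -7; i <= 7; ++i) sum += ext[7 + j + i];
        out[j - 1] = (uint8_t)ROUND_POWER_OF_TWO(sum, 4);
      }
    }

With this layout out[1] reproduces the *op5 expression quoted above (p7 * 6 + p6 + p5 * 2 + ... + q1) and out[13] the *oq6 expression, which gives a quick cross-check of the add/subtract chains.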
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
new file mode 100644
index 000000000..3e6994714
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#define AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define STORE_F0() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s4]) \n\t" \
+ "sb %[q0_f0], 0(%[s4]) \n\t" \
+ "sb %[p0_f0], -1(%[s4]) \n\t" \
+ "sb %[p1_f0], -2(%[s4]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s3]) \n\t" \
+ "sb %[q0_f0], 0(%[s3]) \n\t" \
+ "sb %[p0_f0], -1(%[s3]) \n\t" \
+ "sb %[p1_f0], -2(%[s3]) \n\t" \
+ \
+ : [p1_f0] "+r"(p1_f0) \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \
+ [p0_f0] "r"(p0_f0)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s2]) \n\t" \
+ "sb %[q0_f0], 0(%[s2]) \n\t" \
+ "sb %[p0_f0], -1(%[s2]) \n\t" \
+ "sb %[p1_f0], -2(%[s2]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s1]) \n\t" \
+ "sb %[q0_f0], 0(%[s1]) \n\t" \
+ "sb %[p0_f0], -1(%[s1]) \n\t" \
+ "sb %[p1_f0], -2(%[s1]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \
+ }
+
+#define STORE_F1() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ \
+ : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \
+ [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ \
+ : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \
+ [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
+ }
+
+#define STORE_F2() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s4]) \n\t" \
+ "sb %[q5_r], 5(%[s4]) \n\t" \
+ "sb %[q4_r], 4(%[s4]) \n\t" \
+ "sb %[q3_r], 3(%[s4]) \n\t" \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ "sb %[p3_r], -4(%[s4]) \n\t" \
+ "sb %[p4_r], -5(%[s4]) \n\t" \
+ "sb %[p5_r], -6(%[s4]) \n\t" \
+ "sb %[p6_r], -7(%[s4]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_r], %[q6_r], 16 \n\t" \
+ "srl %[q5_r], %[q5_r], 16 \n\t" \
+ "srl %[q4_r], %[q4_r], 16 \n\t" \
+ "srl %[q3_r], %[q3_r], 16 \n\t" \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ "srl %[p3_r], %[p3_r], 16 \n\t" \
+ "srl %[p4_r], %[p4_r], 16 \n\t" \
+ "srl %[p5_r], %[p5_r], 16 \n\t" \
+ "srl %[p6_r], %[p6_r], 16 \n\t" \
+ \
+ : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
+ [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
+ [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
+ [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
+ [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s3]) \n\t" \
+ "sb %[q5_r], 5(%[s3]) \n\t" \
+ "sb %[q4_r], 4(%[s3]) \n\t" \
+ "sb %[q3_r], 3(%[s3]) \n\t" \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ "sb %[p3_r], -4(%[s3]) \n\t" \
+ "sb %[p4_r], -5(%[s3]) \n\t" \
+ "sb %[p5_r], -6(%[s3]) \n\t" \
+ "sb %[p6_r], -7(%[s3]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s2]) \n\t" \
+ "sb %[q5_l], 5(%[s2]) \n\t" \
+ "sb %[q4_l], 4(%[s2]) \n\t" \
+ "sb %[q3_l], 3(%[s2]) \n\t" \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ "sb %[p3_l], -4(%[s2]) \n\t" \
+ "sb %[p4_l], -5(%[s2]) \n\t" \
+ "sb %[p5_l], -6(%[s2]) \n\t" \
+ "sb %[p6_l], -7(%[s2]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_l], %[q6_l], 16 \n\t" \
+ "srl %[q5_l], %[q5_l], 16 \n\t" \
+ "srl %[q4_l], %[q4_l], 16 \n\t" \
+ "srl %[q3_l], %[q3_l], 16 \n\t" \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ "srl %[p3_l], %[p3_l], 16 \n\t" \
+ "srl %[p4_l], %[p4_l], 16 \n\t" \
+ "srl %[p5_l], %[p5_l], 16 \n\t" \
+ "srl %[p6_l], %[p6_l], 16 \n\t" \
+ \
+ : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
+ [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
+ [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
+ [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
+ [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s1]) \n\t" \
+ "sb %[q5_l], 5(%[s1]) \n\t" \
+ "sb %[q4_l], 4(%[s1]) \n\t" \
+ "sb %[q3_l], 3(%[s1]) \n\t" \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ "sb %[p3_l], -4(%[s1]) \n\t" \
+ "sb %[p4_l], -5(%[s1]) \n\t" \
+ "sb %[p5_l], -6(%[s1]) \n\t" \
+ "sb %[p6_l], -7(%[s1]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \
+ }
+
+#define PACK_LEFT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
+ "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
+ "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
+ "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
+ "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
+ "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
+ "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
+ "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
+ \
+ : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
+ [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
+ [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
+
+#define PACK_LEFT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
+ "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
+ "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
+ "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
+ "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
+ "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
+ "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
+ "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
+ \
+ : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
+ [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
+ [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
+
+#define PACK_RIGHT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
+ "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
+ "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
+ "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
+ "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
+ "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
+ "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
+ "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
+ \
+ : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
+ [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
+ [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
+
+#define PACK_RIGHT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
+ "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
+ "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
+ "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
+ "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
+ "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
+ "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
+ "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
+ \
+ : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
+ [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
+ [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
+
+#define COMBINE_LEFT_RIGHT_0TO2() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
+ "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
+ "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
+ "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
+ "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
+ "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
+ \
+ : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
+ [q1] "=&r"(q1), [q2] "=&r"(q2) \
+ : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \
+ [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \
+ [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \
+ [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \
+ }
+
+#define COMBINE_LEFT_RIGHT_3TO6() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
+ "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
+ "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
+ "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
+ "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
+ "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
+ "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
+ "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
+ \
+ : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
+ [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \
+ [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \
+ [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \
+ [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \
+ [q6_r] "r"(q6_r)); \
+ }
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
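The PACK_LEFT_*, PACK_RIGHT_* and COMBINE_LEFT_RIGHT_* macros are thin wrappers around preceu.ph.qbl, preceu.ph.qbr and precr.qb.ph: a 32-bit word carrying four 8-bit pixels is split into an upper ("left") and a lower ("right") pair of zero-extended 16-bit lanes so the filter sums have headroom, and the low byte of each lane is packed back afterwards. A minimal C model of the three operations (helper names are illustrative, not part of the imported sources):

    #include <stdint.h>

    static uint32_t pack_left(uint32_t px) {  /* preceu.ph.qbl */
      return (((px >> 24) & 0xFF) << 16) | ((px >> 16) & 0xFF);
    }

    static uint32_t pack_right(uint32_t px) { /* preceu.ph.qbr */
      return (((px >> 8) & 0xFF) << 16) | (px & 0xFF);
    }

    static uint32_t combine(uint32_t left, uint32_t right) { /* precr.qb.ph */
      return (((left >> 16) & 0xFF) << 24) | ((left & 0xFF) << 16) |
             (((right >> 16) & 0xFF) << 8) | (right & 0xFF);
    }

combine(pack_left(x), pack_right(x)) == x whenever every 16-bit lane still fits in a byte, which is the round trip the filters rely on; the STORE_F* macros then scatter a result byte by byte with sb, shifting the word right by 8 (byte-packed f0 values) or 16 (halfword-packed _r/_l values) between rows.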
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
new file mode 100644
index 000000000..8db3e521f
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#define AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function */
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+ uint32_t p1, uint32_t p0, uint32_t p3,
+ uint32_t p2, uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t thresh, uint32_t *hev,
+ uint32_t *mask) {
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__(
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh));
+
+ __asm__ __volatile__(
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
+
+ *hev = hev1;
+ *mask = s2;
+}
+
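The pattern repeated throughout this header is: cmpgu.lt.qb writes one condition bit per byte lane, sll ... 24 plus wrdsp moves those bits into the DSPControl condition-code field, and pick.qb expands them into a 0x00/0xFF byte mask (pick.qb x, ones, $0 selects 0xFF where the bit is set, as for hev; swapping the operands, as for mask and flat, selects 0xFF where it is clear). A rough C equivalent of that expansion, for illustration only (the helper name is not part of the imported sources):

    #include <stdint.h>

    /* cc holds one condition bit per byte lane (bit 0 -> byte lane 0). */
    static uint32_t cc_to_byte_mask(uint32_t cc, int invert) {
      uint32_t out = 0;
      int lane;
      for (lane = 0; lane < 4; ++lane) {
        int set = (cc >> lane) & 1;
        if (set ^ invert) out |= 0xFFu << (8 * lane);
      }
      return out;
    }

hev uses invert = 0 (0xFF where a threshold was exceeded), while mask and flat use invert = 1 (0xFF only where every check stayed within its limit).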
+static INLINE void filter_hev_mask_flatmask4_dspr2(
+ uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
+ uint32_t c, r, r3, r_k, r_flat;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t hev1;
+ uint32_t flat1;
+
+ __asm__ __volatile__(
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ * flat |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ * flat |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ /* look at stall here */
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
+ [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
+
+ __asm__ __volatile__(
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
+
+ *hev = hev1;
+ *mask = s2;
+ *flat = flat1;
+}
+
+static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t q4, uint32_t *flat2) {
+ uint32_t c, r, r_k, r_flat;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t flat1, flat3;
+
+ __asm__ __volatile__(
+ /* flat |= (abs(p4 - p0) > thresh) */
+ "subu_s.qb %[c], %[p4], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* flat |= (abs(q4 - q0) > thresh) */
+ "subu_s.qb %[c], %[q4], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+ "wrdsp %[r] \n\t"
+ "pick.qb %[flat3], $0, %[ones] \n\t"
+
+ /* flat |= (abs(p1 - p0) > thresh) */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* flat |= (abs(q1 - q0) > thresh) */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+ /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
+ "and %[flat1], %[flat3], %[flat1] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
+ [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
+ [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
+
+ *flat2 = flat1;
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
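Per pixel, the masks computed above reduce to the comparisons below. This is an illustrative scalar rewrite, assuming flimit is the *blimit byte that the callers replicate into flimit_vec, and ignoring the saturating byte arithmetic of subu_s.qb/addu_s.qb; the vector code yields 0x00/0xFF per lane where this sketch yields 0/1.

    #include <stdlib.h>

    static void hev_mask_flat_sketch(int limit, int blimit, int thresh,
                                     int p3, int p2, int p1, int p0,
                                     int q0, int q1, int q2, int q3,
                                     int *hev, int *mask, int *flat) {
      int m = 0, f = 0;
      m |= abs(p3 - p2) > limit;
      m |= abs(p2 - p1) > limit;
      m |= abs(p1 - p0) > limit;
      m |= abs(q1 - q0) > limit;
      m |= abs(q2 - q1) > limit;
      m |= abs(q3 - q2) > limit;
      m |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
      *mask = !m; /* filter only where no check exceeded its limit */

      *hev = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh);

      f |= abs(p1 - p0) > 1; /* flat_thresh = 0x01010101, i.e. > 1 per lane */
      f |= abs(q1 - q0) > 1;
      f |= abs(p0 - p2) > 1;
      f |= abs(q0 - q2) > 1;
      f |= abs(p3 - p0) > 1;
      f |= abs(q3 - q0) > 1;
      *flat = !f;
    }

flatmask5 extends the same |x - p0| / |x - q0| > 1 checks to p4/q4 and ANDs the result with the four-sample flat mask, producing the flat2 value used to choose the 15-tap path.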
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
new file mode 100644
index 000000000..a3b5a9eb1
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
@@ -0,0 +1,589 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint32_t mask;
+ uint32_t hev, flat;
+ uint8_t i;
+ uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+ for (i = 0; i < 2; i++) {
+ sp3 = s - (pitch << 2);
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__(
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__(
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if ((flat != 0) && (mask != 0)) {
+ /* filtering */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ }
+
+ s = s + 4;
+ }
+}
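When flat and mask agree for all four pixels of a word the results above are stored with a single sw; otherwise the code falls back to per-pixel sb stores, selecting between the 8-tap (mbfilter) and 4-tap (filter1) results lane by lane and shifting the packed registers right (by 16 for the halfword-packed _l/_r values, by 8 for the byte-packed _f0 values) to expose the next lane. The per-lane selection amounts to the sketch below (illustrative only; the helper name and array arguments are not part of the imported sources):

    #include <stdint.h>

    /* mask and flat carry 0x00 or 0xFF per byte lane; wide[] and narrow[]
       hold the unpacked 8-tap and 4-tap results for one output row. */
    static void store_row_sketch(uint8_t *row, uint32_t mask, uint32_t flat,
                                 const uint8_t wide[4],
                                 const uint8_t narrow[4]) {
      int lane;
      for (lane = 0; lane < 4; ++lane) {
        const uint32_t lane_bits = 0xFFu << (8 * lane);
        if (mask & flat & lane_bits) {
          row[lane] = wide[lane];    /* mbfilter result */
        } else if (mask & lane_bits) {
          row[lane] = narrow[lane];  /* filter1 result */
        }                            /* otherwise the pixel stays as is */
      }
    }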
+
+void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat != 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
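aom_lpf_vertical_8_dspr2 above reuses the horizontal-edge arithmetic for a vertical edge by first transposing a 4x4 block of bytes in registers with a sequence of precrq.qb.ph/precr.qb.ph byte-packing steps followed by precrq.ph.w/append halfword steps, as laid out in the in-line before/after comments. The net effect is a plain 4x4 byte transpose (illustrative model; the helper name is not part of the imported sources):

    #include <stdint.h>

    /* in[r][c] is pixel c of row r as loaded with lw; after the transpose
       each output word holds one column of the original block, which is
       what the p0..p3 and q0..q3 registers contain in the code above. */
    static void transpose4x4_sketch(const uint8_t in[4][4], uint8_t out[4][4]) {
      int r, c;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          out[c][r] = in[r][c];
    }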
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
new file mode 100644
index 000000000..8d2fd69f7
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ uint32_t mask;
+ uint32_t hev, flat, flat2;
+ uint8_t i;
+ uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
+ uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+ for (i = 0; i < (2 * count); i++) {
+ sp7 = s - (pitch << 3);
+ sp6 = sp7 + pitch;
+ sp5 = sp6 + pitch;
+ sp4 = sp5 + pitch;
+ sp3 = sp4 + pitch;
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+ sq4 = sq3 + pitch;
+ sq5 = sq4 + pitch;
+ sq6 = sq5 + pitch;
+ sq7 = sq6 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p7], (%[sp7]) \n\t"
+ "lw %[p6], (%[sp6]) \n\t"
+ "lw %[p5], (%[sp5]) \n\t"
+ "lw %[p4], (%[sp4]) \n\t"
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
+
+ __asm__ __volatile__(
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+ "lw %[q4], (%[sq4]) \n\t"
+ "lw %[q5], (%[sq5]) \n\t"
+ "lw %[q6], (%[sq6]) \n\t"
+ "lw %[q7], (%[sq7]) \n\t"
+
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__(
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+ COMBINE_LEFT_RIGHT_3TO6()
+
+ __asm__ __volatile__(
+ "sw %[p6], (%[sp6]) \n\t"
+ "sw %[p5], (%[sp5]) \n\t"
+ "sw %[p4], (%[sp4]) \n\t"
+ "sw %[p3], (%[sp3]) \n\t"
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+
+ :
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
+ [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sw %[q6], (%[sq6]) \n\t"
+ "sw %[q5], (%[sq5]) \n\t"
+ "sw %[q4], (%[sq4]) \n\t"
+ "sw %[q3], (%[sq3]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+
+ :
+ : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
+ [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
+ [sq1] "r"(sq1), [sq0] "r"(sq0));
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__(
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 + f2 */
+ /* f0 function */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* f1 function */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ /* f2 function */
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p6_r], (%[sp6]) \n\t"
+ "sb %[p5_r], (%[sp5]) \n\t"
+ "sb %[p4_r], (%[sp4]) \n\t"
+ "sb %[p3_r], (%[sp3]) \n\t"
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+ "sb %[q3_r], (%[sq3]) \n\t"
+ "sb %[q4_r], (%[sq4]) \n\t"
+ "sb %[q5_r], (%[sq5]) \n\t"
+ "sb %[q6_r], (%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], (%[sp2]) \n\t"
+ "sb %[p1_r_f1], (%[sp1]) \n\t"
+ "sb %[p0_r_f1], (%[sp0]) \n\t"
+ "sb %[q0_r_f1], (%[sq0]) \n\t"
+ "sb %[q1_r_f1], (%[sq1]) \n\t"
+ "sb %[q2_r_f1], (%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
+ [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
+ [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p6_r], +1(%[sp6]) \n\t"
+ "sb %[p5_r], +1(%[sp5]) \n\t"
+ "sb %[p4_r], +1(%[sp4]) \n\t"
+ "sb %[p3_r], +1(%[sp3]) \n\t"
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+ "sb %[q3_r], +1(%[sq3]) \n\t"
+ "sb %[q4_r], +1(%[sq4]) \n\t"
+ "sb %[q5_r], +1(%[sq5]) \n\t"
+ "sb %[q6_r], +1(%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], +1(%[sp2]) \n\t"
+ "sb %[p1_r_f1], +1(%[sp1]) \n\t"
+ "sb %[p0_r_f1], +1(%[sp0]) \n\t"
+ "sb %[q0_r_f1], +1(%[sq0]) \n\t"
+ "sb %[q1_r_f1], +1(%[sq1]) \n\t"
+ "sb %[q2_r_f1], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], +2(%[sp6]) \n\t"
+ "sb %[p5_l], +2(%[sp5]) \n\t"
+ "sb %[p4_l], +2(%[sp4]) \n\t"
+ "sb %[p3_l], +2(%[sp3]) \n\t"
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+ "sb %[q3_l], +2(%[sq3]) \n\t"
+ "sb %[q4_l], +2(%[sq4]) \n\t"
+ "sb %[q5_l], +2(%[sq5]) \n\t"
+ "sb %[q6_l], +2(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], +2(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +2(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +2(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +2(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +2(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], +3(%[sp6]) \n\t"
+ "sb %[p5_l], +3(%[sp5]) \n\t"
+ "sb %[p4_l], +3(%[sp4]) \n\t"
+ "sb %[p3_l], +3(%[sp3]) \n\t"
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+ "sb %[q3_l], +3(%[sq3]) \n\t"
+ "sb %[q4_l], +3(%[sq4]) \n\t"
+ "sb %[q5_l], +3(%[sq5]) \n\t"
+ "sb %[q6_l], +3(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], +3(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +3(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +3(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +3(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +3(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
+}
+
+void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
+}
+#endif // #if HAVE_DSPR2
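
The store sequences above (and in the vertical filter that follows) all use one per-lane dispatch: each 32-bit register carries four pixels, and every byte lane is written back from the strongest filter whose condition bits are set in that lane (f2 when mask & flat & flat2 selects it, f1 when mask & flat does, f0 when only mask does). A minimal scalar sketch of that selection, with select_lane as an illustrative name rather than a library function:

#include <stdint.h>

/* Lane 0 corresponds to the 0x000000FF guards above, lane 3 to 0xFF000000. */
static uint8_t select_lane(uint32_t mask, uint32_t flat, uint32_t flat2,
                           int lane, uint8_t f2_out, uint8_t f1_out,
                           uint8_t f0_out, uint8_t unfiltered) {
  const uint32_t bits = 0xFFu << (8 * lane);
  if (mask & flat & flat2 & bits) return f2_out; /* wide filter (f2) */
  if (mask & flat & bits) return f1_out;         /* flat filter (f1) */
  if (mask & bits) return f0_out;                /* narrow filter (f0) */
  return unfiltered;                             /* pixel left untouched */
}
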
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
new file mode 100644
index 000000000..28528869b
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat, flat2;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[p4], -8(%[s1]) \n\t"
+ "lw %[p5], -8(%[s2]) \n\t"
+ "lw %[p6], -8(%[s3]) \n\t"
+ "lw %[p7], -8(%[s4]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+ "lw %[q7], +4(%[s1]) \n\t"
+ "lw %[q6], +4(%[s2]) \n\t"
+ "lw %[q5], +4(%[s3]) \n\t"
+ "lw %[q4], +4(%[s4]) \n\t"
+
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p7, p6, p5, p4
+ original (when loaded from memory)
+ register -8 -7 -6 -5
+ p4 p4_0 p4_1 p4_2 p4_3
+ p5 p5_0 p5_1 p5_2 p5_3
+ p6 p6_0 p6_1 p6_2 p6_3
+ p7 p7_0 p7_1 p7_2 p7_3
+
+ after transpose
+ register
+ p4 p7_3 p6_3 p5_3 p4_3
+ p5 p7_2 p6_2 p5_2 p4_2
+ p6 p7_1 p6_1 p5_1 p4_1
+ p7 p7_0 p6_0 p5_0 p4_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
+ "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p7], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
+ [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q4, q5, q6, q7
+ original (when loaded from memory)
+ register +5 +6 +7 +8
+ q7 q7_0 q7_1 q7_2 q7_3
+ q6 q6_0 q6_1 q6_2 q6_3
+ q5 q5_0 q5_1 q5_2 q5_3
+ q4 q4_0 q4_1 q4_2 q4_3
+
+ after transpose
+ register
+ q7 q4_3 q5_3 q6_3 q7_3
+ q6 q4_2 q5_2 q6_2 q7_2
+ q5 q4_1 q5_1 q6_1 q7_1
+ q4 q4_0 q5_0 q6_0 q7_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
+ "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
+ "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
+ "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
+
+ "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
+ "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
+ "append %[q6], %[sec3], 16 \n\t"
+ "append %[q4], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
+ [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+ } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ STORE_F2()
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1+f2 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ PACK_LEFT_0TO3()
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ PACK_RIGHT_0TO3()
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p6_r], -7(%[s4]) \n\t"
+ "sb %[p5_r], -6(%[s4]) \n\t"
+ "sb %[p4_r], -5(%[s4]) \n\t"
+ "sb %[p3_r], -4(%[s4]) \n\t"
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+ "sb %[q3_r], +3(%[s4]) \n\t"
+ "sb %[q4_r], +4(%[s4]) \n\t"
+ "sb %[q5_r], +5(%[s4]) \n\t"
+ "sb %[q6_r], +6(%[s4]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s4] "r"(s4));
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], -3(%[s4]) \n\t"
+ "sb %[p1_r_f1], -2(%[s4]) \n\t"
+ "sb %[p0_r_f1], -1(%[s4]) \n\t"
+ "sb %[q0_r_f1], (%[s4]) \n\t"
+ "sb %[q1_r_f1], +1(%[s4]) \n\t"
+ "sb %[q2_r_f1], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
+ [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
+ [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p6_r], -7(%[s3]) \n\t"
+ "sb %[p5_r], -6(%[s3]) \n\t"
+ "sb %[p4_r], -5(%[s3]) \n\t"
+ "sb %[p3_r], -4(%[s3]) \n\t"
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s3] "r"(s3));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+ "sb %[q3_r], +3(%[s3]) \n\t"
+ "sb %[q4_r], +4(%[s3]) \n\t"
+ "sb %[q5_r], +5(%[s3]) \n\t"
+ "sb %[q6_r], +6(%[s3]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s3] "r"(s3));
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], -3(%[s3]) \n\t"
+ "sb %[p1_r_f1], -2(%[s3]) \n\t"
+ "sb %[p0_r_f1], -1(%[s3]) \n\t"
+ "sb %[q0_r_f1], (%[s3]) \n\t"
+ "sb %[q1_r_f1], +1(%[s3]) \n\t"
+ "sb %[q2_r_f1], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], -7(%[s2]) \n\t"
+ "sb %[p5_l], -6(%[s2]) \n\t"
+ "sb %[p4_l], -5(%[s2]) \n\t"
+ "sb %[p3_l], -4(%[s2]) \n\t"
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s2] "r"(s2));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+ "sb %[q3_l], +3(%[s2]) \n\t"
+ "sb %[q4_l], +4(%[s2]) \n\t"
+ "sb %[q5_l], +5(%[s2]) \n\t"
+ "sb %[q6_l], +6(%[s2]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s2] "r"(s2));
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], -3(%[s2]) \n\t"
+ "sb %[p1_l_f1], -2(%[s2]) \n\t"
+ "sb %[p0_l_f1], -1(%[s2]) \n\t"
+ "sb %[q0_l_f1], (%[s2]) \n\t"
+ "sb %[q1_l_f1], +1(%[s2]) \n\t"
+ "sb %[q2_l_f1], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], -7(%[s1]) \n\t"
+ "sb %[p5_l], -6(%[s1]) \n\t"
+ "sb %[p4_l], -5(%[s1]) \n\t"
+ "sb %[p3_l], -4(%[s1]) \n\t"
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s1] "r"(s1));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], 1(%[s1]) \n\t"
+ "sb %[q2_l], 2(%[s1]) \n\t"
+ "sb %[q3_l], 3(%[s1]) \n\t"
+ "sb %[q4_l], 4(%[s1]) \n\t"
+ "sb %[q5_l], 5(%[s1]) \n\t"
+ "sb %[q6_l], 6(%[s1]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s1] "r"(s1));
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], -3(%[s1]) \n\t"
+ "sb %[p1_l_f1], -2(%[s1]) \n\t"
+ "sb %[p0_l_f1], -1(%[s1]) \n\t"
+ "sb %[q0_l_f1], (%[s1]) \n\t"
+ "sb %[q1_l_f1], +1(%[s1]) \n\t"
+ "sb %[q2_l_f1], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
new file mode 100644
index 000000000..450594262
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_LOOPFILTER_MSA_H_
+#define AOM_DSP_LOOPFILTER_MSA_H_
+
+#include "aom_dsp/mips/macros_msa.h"
+
+#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
+ p1_out, p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
+ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
+ v8i16 q0_sub_p0_r, filt_r, cnst3h; \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt = filt & (v16i8)hev_in; \
+ q0_sub_p0 = q0_m - p0_m; \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ \
+ cnst3h = __msa_ldi_h(3); \
+ q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
+ filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
+ filt_r += q0_sub_p0_r; \
+ filt_r = __msa_sat_s_h(filt_r, 7); \
+ \
+ /* combine left and right part */ \
+ filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
+ \
+ filt = filt & (v16i8)mask_in; \
+ cnst4b = __msa_ldi_b(4); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= 3; \
+ \
+ cnst3b = __msa_ldi_b(3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= 3; \
+ \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ \
+ filt = __msa_srari_b(filt1, 1); \
+ hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
+
+#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
+ p1_out, p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
+ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
+ v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q0_sub_p0 = q0_m - p0_m; \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ \
+ cnst3h = __msa_ldi_h(3); \
+ q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
+ filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
+ filt_r += q0_sub_p0_r; \
+ filt_r = __msa_sat_s_h(filt_r, 7); \
+ \
+ q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
+ filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
+ filt_l += q0_sub_p0_l; \
+ filt_l = __msa_sat_s_h(filt_l, 7); \
+ \
+ filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
+ filt = filt & (v16i8)mask_in; \
+ \
+ cnst4b = __msa_ldi_b(4); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= 3; \
+ \
+ cnst3b = __msa_ldi_b(3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= 3; \
+ \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ \
+ filt = __msa_srari_b(filt1, 1); \
+ hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
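
AOM_LPF_FILTER4_8W and AOM_LPF_FILTER4_4W above vectorize the same 4-tap filter and differ only in how many byte lanes they widen to 16 bits before the 3*(q0 - p0) term is accumulated. For orientation, a scalar sketch of that filter (clamp8 and filter4_scalar are illustrative names; an arithmetic right shift is assumed, as in the reference C filter):

#include <stdint.h>

static int8_t clamp8(int t) {
  return (int8_t)(t < -128 ? -128 : t > 127 ? 127 : t);
}

/* Pixels are biased by 0x80 so the arithmetic runs in signed 8-bit range,
   mirroring the __msa_xori_b(..., 0x80) steps above. */
static void filter4_scalar(uint8_t mask, uint8_t hev, uint8_t *op1,
                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
  const int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
  int8_t filt, filt1, filt2;

  filt = (int8_t)(clamp8(ps1 - qs1) & (int8_t)hev);
  filt = (int8_t)(clamp8(filt + 3 * (qs0 - ps0)) & (int8_t)mask);
  filt1 = (int8_t)(clamp8(filt + 4) >> 3);
  filt2 = (int8_t)(clamp8(filt + 3) >> 3);

  *oq0 = (uint8_t)(clamp8(qs0 - filt1) ^ 0x80);
  *op0 = (uint8_t)(clamp8(ps0 + filt2) ^ 0x80);

  filt = (int8_t)((filt1 + 1) >> 1); /* same rounding as __msa_srari_b(filt1, 1) */
  filt = (int8_t)(filt & (int8_t)~hev);

  *oq1 = (uint8_t)(clamp8(qs1 - filt) ^ 0x80);
  *op1 = (uint8_t)(clamp8(ps1 + filt) ^ 0x80);
}
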
+
+#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
+ { \
+ v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
+ v16u8 zero_in = { 0 }; \
+ \
+ tmp_flat4 = __msa_ori_b(zero_in, 1); \
+ p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
+ q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
+ p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
+ q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
+ \
+ p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
+ flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
+ p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
+ flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
+ \
+ flat_out = (tmp_flat4 < (v16u8)flat_out); \
+ flat_out = __msa_xori_b(flat_out, 0xff); \
+ flat_out = flat_out & (mask); \
+ }
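
A scalar reading of AOM_FLAT4 may help; this is only a sketch (flat4_scalar is an illustrative name) and assumes flat_out arrives holding max(|p1 - p0|, |q1 - q0|), which is what LPF_MASK_HEV below leaves in it:

#include <stdint.h>
#include <stdlib.h>

/* The segment is "flat" when p1..p3 and q1..q3 each differ from p0/q0 by at
   most 1; the result is gated by the filter mask, as in the macro. */
static uint8_t flat4_scalar(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                            uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                            uint8_t mask) {
  int m = abs(p1 - p0);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);
  if (abs(p2 - p0) > m) m = abs(p2 - p0);
  if (abs(q2 - q0) > m) m = abs(q2 - q0);
  if (abs(p3 - p0) > m) m = abs(p3 - p0);
  if (abs(q3 - q0) > m) m = abs(q3 - q0);
  return (uint8_t)(((m <= 1) ? 0xFF : 0x00) & mask);
}
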
+
+#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
+ q6_in, q7_in, flat_in, flat2_out) \
+ { \
+ v16u8 tmp_flat5, zero_in = { 0 }; \
+ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
+ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
+ \
+ tmp_flat5 = __msa_ori_b(zero_in, 1); \
+ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
+ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
+ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
+ q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
+ p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
+ q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
+ p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
+ q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
+ \
+ p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
+ p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
+ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
+ \
+ flat2_out = (tmp_flat5 < (v16u8)flat2_out); \
+ flat2_out = __msa_xori_b(flat2_out, 0xff); \
+ flat2_out = flat2_out & flat_in; \
+ }
+
+#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+ q1_filt8_out, q2_filt8_out) \
+ { \
+ v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
+ \
+ tmp_filt8_2 = p2_in + p1_in + p0_in; \
+ tmp_filt8_0 = p3_in << 1; \
+ \
+ tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \
+ tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \
+ p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \
+ p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = q2_in + q1_in + q0_in; \
+ tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \
+ tmp_filt8_0 = tmp_filt8_2 + (p0_in); \
+ tmp_filt8_0 = tmp_filt8_0 + (p3_in); \
+ p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \
+ \
+ tmp_filt8_0 = q2_in + q3_in; \
+ tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \
+ tmp_filt8_1 = q3_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \
+ q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_0 = tmp_filt8_2 + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + q0_in; \
+ q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = tmp_filt8_0 - p2_in; \
+ tmp_filt8_0 = q1_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \
+ q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ }
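
The vector arithmetic in AOM_FILTER8 reduces to six weighted averages whose weights sum to 8, each rounded by (x + 4) >> 3, which is exactly what __msa_srari_h(x, 3) computes. A scalar sketch for cross-checking (ROUND3 and filter8_scalar are illustrative names):

#include <stdint.h>

#define ROUND3(x) (((x) + 4) >> 3)

static void filter8_scalar(int p3, int p2, int p1, int p0, int q0, int q1,
                           int q2, int q3, int *op2, int *op1, int *op0,
                           int *oq0, int *oq1, int *oq2) {
  *op2 = ROUND3(3 * p3 + 2 * p2 + p1 + p0 + q0);
  *op1 = ROUND3(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
  *op0 = ROUND3(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
  *oq0 = ROUND3(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
  *oq1 = ROUND3(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
  *oq2 = ROUND3(p0 + q0 + q1 + 2 * q2 + 3 * q3);
}
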
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
+ p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
+ p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
+ q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
+ q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
+ q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
+ p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
+ p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
+ \
+ /* calculation of hev */ \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = thresh_in < (v16u8)flat_out; \
+ \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m >>= 1; \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ \
+ mask_out = b_limit_in < p0_asub_q0_m; \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ \
+ mask_out = limit_in < (v16u8)mask_out; \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+ }
+#endif /* AOM_DSP_LOOPFILTER_MSA_H_ */
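
LPF_MASK_HEV vectorizes the usual per-pixel edge tests. As a rough scalar sketch (lpf_mask_hev_scalar is an illustrative name; the saturating add on the doubled |p0 - q0| term is ignored here):

#include <stdint.h>
#include <stdlib.h>

static void lpf_mask_hev_scalar(uint8_t limit, uint8_t blimit, uint8_t thresh,
                                uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                                uint8_t *hev, uint8_t *mask) {
  const int a = abs(p1 - p0), b = abs(q1 - q0);
  const int edge = abs(p0 - q0) * 2 + abs(p1 - q1) / 2;
  const int smooth = a <= limit && b <= limit && abs(p3 - p2) <= limit &&
                     abs(p2 - p1) <= limit && abs(q2 - q1) <= limit &&
                     abs(q3 - q2) <= limit;
  *hev = (a > thresh || b > thresh) ? 0xFF : 0x00;  /* "high edge variance" */
  *mask = (smooth && edge <= blimit) ? 0xFF : 0x00; /* filter this pixel */
}
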
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
new file mode 100644
index 000000000..48fbcfd47
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/macros_msa.h
@@ -0,0 +1,2057 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_MACROS_MSA_H_
+#define AOM_DSP_MIPS_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
+#if (__mips_isa_rev >= 6)
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ \
+ val0_m = LW(psrc_m); \
+ val1_m = LW(psrc_m + 4); \
+ \
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+#else // !(__mips_isa_rev >= 6)
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m_combined = 0; \
+ \
+ val0_m = LW(psrc_m1); \
+ val1_m = LW(psrc_m1 + 4); \
+ \
+ val_m_combined = (uint64_t)(val1_m); \
+ val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
+ val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \
+ \
+ val_m_combined; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_m1 = (uint8_t *)(pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_m1); \
+ SW(val1_m, pdst_m1 + 4); \
+ }
+#endif // (__mips_isa_rev >= 6)
+
+/* Description : Load 4 words with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1, out2, out3
+ Details : Load word in 'out0' from (psrc)
+ Load word in 'out1' from (psrc + stride)
+ Load word in 'out2' from (psrc + 2 * stride)
+ Load word in 'out3' from (psrc + 3 * stride)
+*/
+#define LW4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ out0 = LW((psrc)); \
+ out1 = LW((psrc) + stride); \
+ out2 = LW((psrc) + 2 * stride); \
+ out3 = LW((psrc) + 3 * stride); \
+ }
+
+/* Description : Load double words with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Details : Load double word in 'out0' from (psrc)
+ Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) \
+ { \
+ out0 = LD((psrc)); \
+ out1 = LD((psrc) + stride); \
+ }
+#define LD4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD2((psrc), stride, out0, out1); \
+ LD2((psrc) + 2 * stride, stride, out2, out3); \
+ }
+
+/* Description : Store 4 words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store word from 'in0' to (pdst)
+ Store word from 'in1' to (pdst + stride)
+ Store word from 'in2' to (pdst + 2 * stride)
+ Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) \
+ { \
+ SW(in0, (pdst)) \
+ SW(in1, (pdst) + stride); \
+ SW(in2, (pdst) + 2 * stride); \
+ SW(in3, (pdst) + 3 * stride); \
+ }
+
+/* Description : Store 4 double words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store double word from 'in0' to (pdst)
+ Store double word from 'in1' to (pdst + stride)
+ Store double word from 'in2' to (pdst + 2 * stride)
+ Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) \
+ { \
+ SD(in0, (pdst)) \
+ SD(in1, (pdst) + stride); \
+ SD(in2, (pdst) + 2 * stride); \
+ SD(in3, (pdst) + 3 * stride); \
+ }
+
+/* Description : Load vectors with 16 byte elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Load 16 byte elements in 'out0' from (psrc)
+ Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_B(RTYPE, (psrc)); \
+ out1 = LD_B(RTYPE, (psrc) + stride); \
+ }
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
+ }
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
+ { \
+ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
+ }
+#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
+ { \
+ LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
+ LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
+ }
+#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7) \
+ { \
+ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+ }
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
+/* Description : Load vectors with 8 halfword elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Details : Load 8 halfword elements in 'out0' from (psrc)
+ Load 8 halfword elements in 'out1' from (psrc + stride)
+*/
+#define LD_H2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_H(RTYPE, (psrc)); \
+ out1 = LD_H(RTYPE, (psrc) + (stride)); \
+ }
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_H2(RTYPE, (psrc), stride, out0, out1); \
+ LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7) \
+ { \
+ LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+ }
+#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
+
+#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7, out8, out9, out10, out11, out12, out13, out14, out15) \
+ { \
+ LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7); \
+ LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
+ out13, out14, out15); \
+ }
+#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+ data into 4 vectors (Each vector with 4 signed halfwords)
+ Arguments : Input - psrc
+ Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) \
+ { \
+ out0 = LD_SH(psrc); \
+ out2 = LD_SH(psrc + 8); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ }
+
+/* Description : Load 2 vectors of signed word elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) \
+ { \
+ out0 = LD_SW((psrc)); \
+ out1 = LD_SW((psrc) + stride); \
+ }
+
+/* Description : Store vectors of 16 byte elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 16 byte elements from 'in0' to (pdst)
+ Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+ { \
+ ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+ }
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
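+
+/* Usage sketch: a hypothetical helper (illustrative only) pairing LD_UB8 and
+   ST_UB8 above to copy a 16x8 pixel block; LD_B/ST_B are defined earlier in
+   this header. */
+static inline void copy_16x8_sketch(const uint8_t *src, int32_t src_stride,
+                                    uint8_t *dst, int32_t dst_stride) {
+  v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+  LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+  ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
+}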
+
+/* Description : Store vectors of 8 halfword elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 8 halfword elements from 'in0' to (pdst)
+ Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_H2(RTYPE, in0, in1, (pdst), stride); \
+ ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
+#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+ { \
+ ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
+ ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+ }
+#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 4 word elements from 'in0' to (pdst)
+ Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) \
+ { \
+ ST_SW(in0, (pdst)); \
+ ST_SW(in1, (pdst) + stride); \
+ }
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+ Arguments : Inputs - in, stidx, pdst, stride
+ Details : Index 'stidx' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst)
+ Index 'stidx+1' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + stride)
+ Index 'stidx+2' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 2 * stride)
+ Index 'stidx+3' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) \
+ { \
+ uint16_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
+ out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
+ out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
+ out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
+ \
+ SH(out0_m, pblk_2x4_m); \
+ SH(out1_m, pblk_2x4_m + stride); \
+ SH(out2_m, pblk_2x4_m + 2 * stride); \
+ SH(out3_m, pblk_2x4_m + 3 * stride); \
+ }
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 word element from 'in' vector is copied to the GP
+ register and stored to (pdst)
+ Index 1 word element from 'in' vector is copied to the GP
+ register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m; \
+ uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in, 0); \
+ out1_m = __msa_copy_u_w((v4i32)in, 1); \
+ \
+ SW(out0_m, pblk_4x2_m); \
+ SW(out1_m, pblk_4x2_m + stride); \
+ }
+
+/* Description : Store 4x4 byte block to destination memory from input vectors
+   Arguments   : Inputs  - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
+   Details     : 'Idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'Idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'Idx2' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'Idx3' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
+ out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
+ out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
+ out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
+ \
+ SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+ }
+#define ST4x8_UB(in0, in1, pdst, stride) \
+ { \
+ uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
+ \
+ ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
+ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+ }
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) \
+ { \
+ uint64_t out0_m; \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ SD(out0_m, pdst); \
+ }
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m; \
+ uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in, 1); \
+ \
+ SD(out0_m, pblk_8x2_m); \
+ SD(out1_m, pblk_8x2_m + stride); \
+ }
+
+/* Description : Store 8x4 byte block to destination memory from input
+ vectors
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Index 0 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst + stride)
+ Index 0 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ Index 1 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in0, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in0, 1); \
+ out2_m = __msa_copy_u_d((v2i64)in1, 0); \
+ out3_m = __msa_copy_u_d((v2i64)in1, 1); \
+ \
+ SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+ }
+
+/* Description : Average with rounding (in0 + in1 + 1) / 2.
+   Arguments   : Inputs  - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from 'in0' vector is added with
+ each unsigned byte element from 'in1' vector. Then the average
+ with rounding is calculated and written to 'out0'
+*/
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
+ out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
+ }
+#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
+
+#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
+ }
+#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
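+
+/* Usage sketch: a hypothetical averaging copy of a 16x4 block (illustrative
+   only), built from the LD_UB4, AVER_UB4_UB and ST_UB4 macros above. */
+static inline void avg_16x4_sketch(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride) {
+  v16u8 s0, s1, s2, s3, d0, d1, d2, d3;
+
+  LD_UB4(src, src_stride, s0, s1, s2, s3);
+  LD_UB4(dst, dst_stride, d0, d1, d2, d3);
+  /* d[i] = (s[i] + d[i] + 1) >> 1 for every byte lane */
+  AVER_UB4_UB(s0, d0, s1, d1, s2, d2, s3, d3, d0, d1, d2, d3);
+  ST_UB4(d0, d1, d2, d3, dst, dst_stride);
+}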
+
+/* Description : Immediate number of elements to slide with zero
+ Arguments : Inputs - in0, in1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'zero_m' vector are slid into 'in0' by
+ value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
+ }
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
+ slide_val) \
+ { \
+ SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
+ SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
+ }
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+ Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+ value specified in the 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ { \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
+ }
+#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
+ out2, slide_val) \
+ { \
+ SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
+ }
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+ }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
+ out3) \
+ { \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
+ }
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Unsigned byte elements from 'mult0' are multiplied with
+ unsigned byte elements from 'cnst0' producing a result
+ twice the size of input i.e. unsigned halfword.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
+ }
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
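+
+/* Usage sketch: a hypothetical 2-tap horizontal filter over two 8-pixel rows
+   (illustrative only), combining VSHF_B2_UB and DOTP_UB2_UH from above.
+   'filt0' is assumed to hold the two taps replicated across the vector and
+   'mask' pairs each pixel with its right neighbour. */
+static inline void hfilt_2tap_8x2_sketch(const uint8_t *src, int32_t stride,
+                                         v16u8 filt0, v8u16 *row0,
+                                         v8u16 *row1) {
+  const v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v16u8 src0, src1, vec0, vec1;
+  v8u16 out0, out1;
+
+  LD_UB2(src, stride, src0, src1);                     /* two input rows    */
+  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt0, filt0, out0, out1);   /* p[i]*f0+p[i+1]*f1 */
+  *row0 = out0;
+  *row1 = out1;
+}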
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
+ }
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
+ }
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed word elements from 'mult0' are multiplied with
+ signed word elements from 'cnst0' producing a result
+ twice the size of input i.e. signed double word.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
+ }
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication result of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
+ }
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+ The multiplication result of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+ }
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+ Arguments : Inputs - mult0, mult1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed word element from 'mult0' is multiplied with itself
+ producing an intermediate result twice the size of input
+ i.e. signed double word
+ The multiplication result of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
+ out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
+ }
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Minimum values between unsigned elements of
+ either vector are copied to the output vector
+ Arguments : Inputs - in0, in1, min_vec
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Minimum of unsigned halfword element values from 'in0' and
+ 'min_vec' are written to output vector 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) \
+ { \
+ in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
+ in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
+ }
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
+ { \
+ MIN_UH2(RTYPE, in0, in1, min_vec); \
+ MIN_UH2(RTYPE, in2, in3, min_vec); \
+ }
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+ between 0 & 255
+ Arguments : Input - in
+ Output - out_m
+ Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) \
+ ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)in, 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+ })
+#define CLIP_SH2_0_255(in0, in1) \
+ { \
+ in0 = CLIP_SH_0_255(in0); \
+ in1 = CLIP_SH_0_255(in1); \
+ }
+#define CLIP_SH4_0_255(in0, in1, in2, in3) \
+ { \
+ CLIP_SH2_0_255(in0, in1); \
+ CLIP_SH2_0_255(in2, in3); \
+ }
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+ Arguments : Input - in (signed word vector)
+ Output - sum_m (i32 sum)
+ Return Type - signed word (GP)
+ Details : 4 signed word elements of 'in' vector are added together and
+ the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in) \
+ ({ \
+ v2i64 res0_m, res1_m; \
+ int32_t sum_m; \
+ \
+ res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
+ res1_m = __msa_splati_d(res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ Arguments : Inputs - in (unsigned halfword vector)
+ Outputs - sum_m (u32 sum)
+ Return Type - unsigned word
+ Details : 8 unsigned halfword elements of input vector are added
+ together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) \
+ ({ \
+ v4u32 res_m; \
+ v2u64 res0_m, res1_m; \
+ uint32_t sum_m; \
+ \
+ res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+ res0_m = __msa_hadd_u_d(res_m, res_m); \
+ res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
+
+/* Description : Horizontal addition of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is added to
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ HADD_UB2(RTYPE, in0, in1, out0, out1); \
+ HADD_UB2(RTYPE, in2, in3, out2, out3); \
+ }
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
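+
+/* Usage sketch: a hypothetical 16x4 pixel sum (illustrative only), the kind
+   of reduction used for DC averaging, built from LD_UB4, HADD_UB4_UH and
+   HADD_UH_U32 above. */
+static inline uint32_t sum_16x4_sketch(const uint8_t *src, int32_t stride) {
+  v16u8 s0, s1, s2, s3;
+  v8u16 h0, h1, h2, h3;
+
+  LD_UB4(src, stride, s0, s1, s2, s3);
+  HADD_UB4_UH(s0, s1, s2, s3, h0, h1, h2, h3); /* pairwise byte sums     */
+  h0 = h0 + h1 + h2 + h3;                      /* one vector of partials */
+  return HADD_UH_U32(h0);                      /* fold to a single u32   */
+}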
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is subtracted from
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : SAD (Sum of Absolute Difference)
+ Arguments : Inputs - in0, in1, ref0, ref1
+ Outputs - sad_m (halfword vector)
+ Return Type - unsigned halfword
+ Details : Absolute difference of all the byte elements from 'in0' with
+ 'ref0' is calculated and preserved in 'diff0'. Then even-odd
+ pairs are added together to generate 8 halfword results.
+*/
+#define SAD_UB2_UH(in0, in1, ref0, ref1) \
+ ({ \
+ v16u8 diff0_m, diff1_m; \
+ v8u16 sad_m = { 0 }; \
+ \
+ diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
+ diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
+ \
+ sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
+ sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
+ \
+ sad_m; \
+ })
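+
+/* Usage sketch: a hypothetical 16x4 SAD (illustrative only) that pairs
+   SAD_UB2_UH with the HADD_UH_U32 reduction above. */
+static inline uint32_t sad_16x4_sketch(const uint8_t *src, int32_t src_stride,
+                                       const uint8_t *ref,
+                                       int32_t ref_stride) {
+  v16u8 s0, s1, s2, s3, r0, r1, r2, r3;
+  v8u16 sad;
+
+  LD_UB4(src, src_stride, s0, s1, s2, s3);
+  LD_UB4(ref, ref_stride, r0, r1, r2, r3);
+  sad = SAD_UB2_UH(s0, s1, r0, r1);
+  sad += SAD_UB2_UH(s2, s3, r2, r3);
+  return HADD_UH_U32(sad); /* fold the 8 halfword partial sums to one u32 */
+}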
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed odd halfword element from 'in0' is subtracted from
+ even signed halfword element from 'in0' (pairwise) and the
+ word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+ }
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set element n of input vector to GPR value
+ Arguments : Inputs - in0, in1, in2, in3
+ Output - out
+ Return Type - as per RTYPE
+ Details : Set element 0 in vector 'out' to value specified in 'in0'
+*/
+#define INSERT_W2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ }
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
+ }
+#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
+#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
+
+#define INSERT_D2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
+ }
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
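+
+/* Usage sketch: a hypothetical gather of four stride-separated 4-byte rows
+   into a single vector (illustrative only), the usual LW4 + INSERT_W4_UB
+   idiom for 4-pixel-wide blocks. */
+static inline v16u8 gather_4x4_sketch(const uint8_t *src, int32_t stride) {
+  uint32_t w0, w1, w2, w3;
+  v16u8 out = { 0 };
+
+  LW4(src, stride, w0, w1, w2, w3);  /* one word per row                 */
+  INSERT_W4_UB(w0, w1, w2, w3, out); /* rows packed into word lanes 0..3 */
+  return out;
+}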
+
+/* Description : Interleave even byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+ }
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+ }
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+ }
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+ }
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements of 'in0' and 'in1' are interleaved
+ and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \
+ out5, out6, out7) \
+ { \
+ ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3); \
+ ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \
+ out6, out7); \
+ }
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of double word elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
+ }
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
+ }
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'; the left half is
+                 interleaved and written to 'out1'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ }
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ }
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+ unsigned value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range.
+ The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+ }
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_UH2(RTYPE, in0, in1, sat_val); \
+ SAT_UH2(RTYPE, in2, in3, sat_val) \
+ }
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+               signed value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range
+ The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+ }
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_SH2(RTYPE, in0, in1, sat_val); \
+ SAT_SH2(RTYPE, in2, in3, sat_val); \
+ }
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, idx0, idx1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'idx0' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
+ out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
+ }
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
+ { \
+ SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
+ SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
+ }
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' are copied to the left half of
+ 'out0' & even byte elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+ }
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' are copied to the left half of
+ 'out0' & even halfword elements of 'in1' are copied to the
+ right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+ }
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double elements of 'in0' are copied to the left half of
+ 'out0' & even double elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
+ out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
+ }
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+ Arguments : Inputs - in0, in1
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from input vector 'in0' is
+ logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) \
+ { \
+ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
+ in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
+ }
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
+ }
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ XORI_B2_128(RTYPE, in2, in3); \
+ }
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
+ { \
+ XORI_B4_128(RTYPE, in0, in1, in2, in3); \
+ XORI_B3_128(RTYPE, in4, in5, in6); \
+ }
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
+/* Description : Average of signed halfword elements -> (a + b) / 2
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is added to each
+ signed halfword element of 'in1' with full precision resulting
+ in one extra bit in the result. The result is then divided by
+ 2 and written to 'out0'
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
+ out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
+ out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
+ }
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Addition of signed halfword elements and signed saturation
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'in0' are added to signed
+ halfword elements of 'in1'. The result is then signed saturated
+ between halfword data type range
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is left shifted by 'shift' and
+ the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 << shift; \
+ in1 = in1 << shift; \
+ in2 = in2 << shift; \
+ in3 = in3 << shift; \
+ }
+
+/* Description : Arithmetic shift right all elements of vector
+ (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is right shifted by 'shift' and
+ the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ in2 = in2 >> shift; \
+ in3 = in3 >> shift; \
+ }
+
+/* Description : Shift right arithmetic rounded words
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the number of bits in the corresponding element in the vector
+ 'shift'. The last discarded bit is added to shifted value for
+ rounding and the result is written in-place.
+ 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
+ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
+ }
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRAR_W2(RTYPE, in0, in1, shift) \
+ SRAR_W2(RTYPE, in2, in3, shift) \
+ }
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the value in 'shift'. The last discarded bit is added to the
+ shifted value for rounding and the result is written in-place.
+ 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+ in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+ }
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_H2(RTYPE, in0, in1, shift); \
+ SRARI_H2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
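+
+/* Usage sketch: a hypothetical filter epilogue for two rows of halfword
+   results (illustrative only): round-shift, saturate to 8 bits and narrow to
+   bytes with SRARI_H2_UH, SAT_UH2_UH and __msa_pckev_b. The shift amount 7
+   is just an assumed filter precision. */
+static inline v16u8 round_sat_pack_sketch(v8u16 row0, v8u16 row1) {
+  SRARI_H2_UH(row0, row1, 7); /* rounding shift by the filter precision */
+  SAT_UH2_UH(row0, row1, 7);  /* clamp each lane to the 8-bit range     */
+  /* the even (low) byte of each halfword lane holds the final pixel */
+  return (v16u8)__msa_pckev_b((v16i8)row1, (v16i8)row0);
+}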
+
+#define SRARI_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+ }
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Logical shift right all elements of vector (immediate)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is logically right shifted by
+                 'shift' and the result is written to 'out0'. 'shift' is an
+                 immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
+ { \
+ out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
+ out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
+ out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
+ out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
+ }
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+ }
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Addition of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in0' is added to 'in1' and result is written
+ to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+ }
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Subtraction of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in1' is subtracted from 'in0' and result is
+ written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ }
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+ out3 = in6 - in7; \
+ }
+
+/* Description : Sign extend halfword elements from right half of the vector
+ Arguments : Input - in (halfword vector)
+ Output - out (sign extended word vector)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved with the same vector 'in' to generate
+ 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out) \
+ { \
+ v8i16 sign_m; \
+ \
+ sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+ }
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+ Arguments : Input - in (unsigned byte vector)
+ Outputs - out0, out1 (unsigned halfword vectors)
+ Return Type - signed halfword
+ Details : Zero extended right half of vector is returned in 'out0'
+ Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVRL_B2_SH(zero_m, in, out0, out1); \
+ }
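+
+/* Usage sketch: a hypothetical reconstruction of 16 pixels (illustrative
+   only): widen the prediction with UNPCK_UB_SH, add the halfword residual
+   with ADD2, clamp with CLIP_SH2_0_255 and narrow back to bytes. LD_UB and
+   LD_SH are defined earlier in this header. */
+static inline v16u8 recon_16_sketch(const int16_t *res, const uint8_t *pred) {
+  v16u8 p0;
+  v8i16 r0, r1, p_r, p_l;
+
+  p0 = LD_UB(pred);                 /* 16 prediction bytes            */
+  r0 = LD_SH(res);                  /* first 8 residual halfwords     */
+  r1 = LD_SH(res + 8);              /* next 8 residual halfwords      */
+  UNPCK_UB_SH(p0, p_r, p_l);        /* zero-extend bytes to halfwords */
+  ADD2(p_r, r0, p_l, r1, p_r, p_l); /* prediction + residual          */
+  CLIP_SH2_0_255(p_r, p_l);         /* clamp to the valid pixel range */
+  return (v16u8)__msa_pckev_b((v16i8)p_l, (v16i8)p_r); /* back to bytes */
+}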
+
+/* Description : Sign extend halfword elements from input vector and return
+ the result in pair of vectors
+ Arguments : Input - in (halfword vector)
+ Outputs - out0, out1 (sign extended word vectors)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with the same vector 'in' to
+                 generate 4 signed word elements in 'out0'
+                 Then interleaved left with the same vector 'in' to
+ generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1) \
+ { \
+ v8i16 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_h((v8i16)in, 0); \
+ ILVRL_H2_SW(tmp_m, in, out0, out1); \
+ }
+
+/* Description : Butterfly of 4 input vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = in0 + in3; \
+ out1 = in1 + in2; \
+ \
+ out2 = in1 - in2; \
+ out3 = in0 - in3; \
+ }
+
+/* Description : Butterfly of 8 input vectors
+ Arguments : Inputs - in0 ... in7
+ Outputs - out0 .. out7
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ out0 = in0 + in7; \
+ out1 = in1 + in6; \
+ out2 = in2 + in5; \
+ out3 = in3 + in4; \
+ \
+ out4 = in3 - in4; \
+ out5 = in2 - in5; \
+ out6 = in1 - in6; \
+ out7 = in0 - in7; \
+ }
+
+/* Description : Butterfly of 16 input vectors
+ Arguments : Inputs - in0 ... in15
+ Outputs - out0 .. out15
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, \
+ out4, out5, out6, out7, out8, out9, out10, out11, out12, \
+ out13, out14, out15) \
+ { \
+ out0 = in0 + in15; \
+ out1 = in1 + in14; \
+ out2 = in2 + in13; \
+ out3 = in3 + in12; \
+ out4 = in4 + in11; \
+ out5 = in5 + in10; \
+ out6 = in6 + in9; \
+ out7 = in7 + in8; \
+ \
+ out8 = in7 - in8; \
+ out9 = in6 - in9; \
+ out10 = in5 - in10; \
+ out11 = in4 - in11; \
+ out12 = in3 - in12; \
+ out13 = in2 - in13; \
+ out14 = in1 - in14; \
+ out15 = in0 - in15; \
+ }
+
+/* Description : Transpose input 8x8 byte block
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
+ tmp3_m); \
+ ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
+ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
+ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
+ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
+ SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
+ SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
+ }
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ \
+ tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
+ tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
+ tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
+ tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
+ out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
+ tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
+ out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
+ tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
+ \
+ ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
+ out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);            \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);            \
+ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ }
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 s0_m, s1_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
+ ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
+ }
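+
+/* Usage sketch: a hypothetical in-register transpose of a contiguous 4x4
+   halfword block (illustrative only), pairing LD4x4_SH with
+   TRANSPOSE4x4_SH_SH above. */
+static inline void transpose_4x4_sketch(const int16_t *src, v8i16 *col0,
+                                        v8i16 *col1, v8i16 *col2,
+                                        v8i16 *col3) {
+  v8i16 r0, r1, r2, r3;
+
+  LD4x4_SH(src, r0, r1, r2, r3); /* rows land in the low halves */
+  TRANSPOSE4x4_SH_SH(r0, r1, r2, r3, *col0, *col1, *col2, *col3);
+}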
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
+ v8i16 zero_m = { 0 }; \
+ \
+ ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
+ tmp3_n); \
+ ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
+ ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
+ \
+ out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ \
+ out4 = zero_m; \
+ out5 = zero_m; \
+ out6 = zero_m; \
+ out7 = zero_m; \
+ }
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+ }
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
+ ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
+ ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
+ PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
+ tmp7_m, out0, out2, out4, out6); \
+ out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ }
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
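+
+/* Usage sketch (illustrative; c0..c7 are placeholder v8i16 registers holding
+ * the rows of an 8x8 coefficient tile). All inputs are read before any output
+ * is written, so the transpose may be done in place:
+ *   TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
+ *                      c0, c1, c2, c3, c4, c5, c6, c7);
+ */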
+
+/* Description : Transpose 4x4 block with word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ \
+ out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
+ out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
+ out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
+ out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
+ }
+
+/* Description : Add block 4x4
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : The least significant four halfword elements of each input
+ vector are added to the destination bytes, clipped to the
+ range 0-255 and stored.
+*/
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
+ { \
+ uint32_t src0_m, src1_m, src2_m, src3_m; \
+ v8i16 inp0_m, inp1_m, res0_m, res1_m; \
+ v16i8 dst0_m = { 0 }; \
+ v16i8 dst1_m = { 0 }; \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
+ LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
+ INSERT_W2_SB(src0_m, src1_m, dst0_m); \
+ INSERT_W2_SB(src2_m, src3_m, dst1_m); \
+ ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
+ CLIP_SH2_0_255(res0_m, res1_m); \
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
+ ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
+ }
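+
+/* Usage sketch (illustrative; res0..res3 are placeholder v8i16 vectors whose
+ * low four halfwords each hold one row of 4x4 residuals): adds the residuals
+ * to the prediction already stored at 'dst' and writes back the clipped
+ * result:
+ *   ADDBLK_ST4x4_UB(res0, res1, res2, res3, dst, dst_stride);
+ */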
+
+/* Description : Pack even elements of input vectors & xor with 128
+ Arguments : Inputs - in0, in1
+ Output - out_m
+ Return Type - unsigned byte
+ Details : Signed byte even elements from 'in0' and 'in1' are packed
+ together in one vector and the resulting vector is xor'ed with
+ 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1) \
+ ({ \
+ v16u8 out_m; \
+ \
+ out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
+ out_m; \
+ })
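+
+/* Worked example: a signed byte result of -5 is stored as 0xFB, and
+ * 0xFB ^ 0x80 = 0x7B = 123 = -5 + 128, so the XOR maps the signed range
+ * [-128, 127] onto the unsigned pixel range [0, 255]. */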
+
+/* Description : Converts inputs to unsigned bytes, averages them with the
+ destination rows & stores the result as an 8x4 unsigned byte
+ block
+ Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ pdst, stride
+*/
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
+ pdst, stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ tmp0_m = PCKEV_XORI128_UB(in0, in1); \
+ tmp1_m = PCKEV_XORI128_UB(in2, in3); \
+ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
+ }
+
+/* Description : Pack even byte elements and store byte vector in destination
+ memory
+ Arguments : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst) \
+ { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ ST_SB(tmp_m, (pdst)); \
+ }
+
+/* Description : Horizontal 2 tap filter kernel code
+ Arguments : Inputs - in0, in1, mask, coeff, shift
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
+ ({ \
+ v16i8 tmp0_m; \
+ v8u16 tmp1_m; \
+ \
+ tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
+ tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
+ tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
+ \
+ tmp1_m; \
+ })
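+
+/* Scalar equivalent (sketch): for each output lane, with the pixel pair
+ * (p0, p1) selected by 'mask' and the two filter taps (c0, c1) in 'coeff':
+ *   out = (p0 * c0 + p1 * c1 + (1 << (shift - 1))) >> shift;
+ */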
+#endif /* AOM_DSP_MIPS_MACROS_MSA_H_ */
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
new file mode 100644
index 000000000..258eb5c07
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/sad_msa.c
@@ -0,0 +1,1529 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
+#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
+ }
+#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
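+
+/* SAD_INSVE_W4 gathers word element 0 (the first four pixels) of each of the
+ * four reference row vectors into a single vector, so a whole 4x4 block can
+ * be differenced with one absolute-difference instruction. */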
+
+static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return HADD_UH_U32(sad);
+}
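+
+/* Scalar reference (sketch) for the vector code above:
+ *   uint32_t sad = 0;
+ *   for (int y = 0; y < height; ++y)
+ *     for (int x = 0; x < 4; ++x)
+ *       sad += abs(src_ptr[y * src_stride + x] - ref_ptr[y * ref_stride + x]);
+ */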
+
+static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t sad = 0;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad = HADD_UH_U32(sad0);
+ sad += HADD_UH_U32(sad1);
+
+ return sad;
+}
+
+static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 ref0, ref1, ref2, ref3, diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+}
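+
+/* The x3 variants return three SADs of the same source block against 'ref',
+ * 'ref + 1' and 'ref + 2': each pair of SLDI_B2_UB calls slides the reference
+ * window right by one more pixel. */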
+
+static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+ ref += (4 * ref_stride);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
+ ref0, ref1);
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ v16u8 src, ref, ref0, ref1, diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+
+ for (ht_cnt = height >> 1; ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+ ref += ref_stride;
+
+ sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+ ref += ref_stride;
+
+ sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
+ v8u16 sad0_0 = { 0 };
+ v8u16 sad0_1 = { 0 };
+ v8u16 sad1_0 = { 0 };
+ v8u16 sad1_1 = { 0 };
+ v8u16 sad2_0 = { 0 };
+ v8u16 sad2_1 = { 0 };
+ v4u32 sad;
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
+ ref0_4 = LD_UB(ref + 64);
+ ref += ref_stride;
+
+ sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+ sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+ sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+ sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad = __msa_hadd_u_w(sad0_0, sad0_0);
+ sad += __msa_hadd_u_w(sad0_1, sad0_1);
+ sad_array[0] = HADD_SW_S32((v4i32)sad);
+
+ sad = __msa_hadd_u_w(sad1_0, sad1_0);
+ sad += __msa_hadd_u_w(sad1_1, sad1_1);
+ sad_array[1] = HADD_SW_S32((v4i32)sad);
+
+ sad = __msa_hadd_u_w(sad2_0, sad2_0);
+ sad += __msa_hadd_u_w(sad2_1, sad2_1);
+ sad_array[2] = HADD_SW_S32((v4i32)sad);
+}
+
+static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3, diff;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+ v8u16 sad4 = { 0 };
+ v8u16 sad5 = { 0 };
+ v8u16 sad6 = { 0 };
+ v8u16 sad7 = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad3 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad4 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad5 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad6 += __msa_hadd_u_h(diff, diff);
+
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+ diff = __msa_asub_u_b(src, ref);
+ sad7 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+ sad_array[4] = HADD_UH_U32(sad4);
+ sad_array[5] = HADD_UH_U32(sad5);
+ sad_array[6] = HADD_UH_U32(sad6);
+ sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+ v8u16 sad4 = { 0 };
+ v8u16 sad5 = { 0 };
+ v8u16 sad6 = { 0 };
+ v8u16 sad7 = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+ ref += (4 * ref_stride);
+ PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
+ ref0, ref1);
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+ SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+ PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+ sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+ sad_array[4] = HADD_UH_U32(sad4);
+ sad_array[5] = HADD_UH_U32(sad5);
+ sad_array[6] = HADD_UH_U32(sad6);
+ sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ v16u8 src, ref0, ref1, ref;
+ v16u8 diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+ v8u16 sad4 = { 0 };
+ v8u16 sad5 = { 0 };
+ v8u16 sad6 = { 0 };
+ v8u16 sad7 = { 0 };
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+ diff = __msa_asub_u_b(src, ref);
+ sad3 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+ diff = __msa_asub_u_b(src, ref);
+ sad4 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+ diff = __msa_asub_u_b(src, ref);
+ sad5 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+ diff = __msa_asub_u_b(src, ref);
+ sad6 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+ diff = __msa_asub_u_b(src, ref);
+ sad7 += __msa_hadd_u_h(diff, diff);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+ diff = __msa_asub_u_b(src, ref);
+ sad3 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+ diff = __msa_asub_u_b(src, ref);
+ sad4 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+ diff = __msa_asub_u_b(src, ref);
+ sad5 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+ diff = __msa_asub_u_b(src, ref);
+ sad6 += __msa_hadd_u_h(diff, diff);
+
+ ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+ diff = __msa_asub_u_b(src, ref);
+ sad7 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+ sad_array[4] = HADD_UH_U32(sad4);
+ sad_array[5] = HADD_UH_U32(sad5);
+ sad_array[6] = HADD_UH_U32(sad6);
+ sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ int32_t ht_cnt;
+ v16u8 src0, src1;
+ v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+ v8u16 sad4 = { 0 };
+ v8u16 sad5 = { 0 };
+ v8u16 sad6 = { 0 };
+ v8u16 sad7 = { 0 };
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+ ref += ref_stride;
+
+ sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+ sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+ sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+ sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+ sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+ sad_array[4] = HADD_UH_U32(sad4);
+ sad_array[5] = HADD_UH_U32(sad5);
+ sad_array[6] = HADD_UH_U32(sad6);
+ sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, uint32_t *sad_array) {
+ const uint8_t *src_dup, *ref_dup;
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 sad0_0 = { 0 };
+ v8u16 sad0_1 = { 0 };
+ v8u16 sad1_0 = { 0 };
+ v8u16 sad1_1 = { 0 };
+ v8u16 sad2_0 = { 0 };
+ v8u16 sad2_1 = { 0 };
+ v8u16 sad3_0 = { 0 };
+ v8u16 sad3_1 = { 0 };
+ v4u32 sad;
+
+ src_dup = src;
+ ref_dup = ref;
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
+ ref += ref_stride;
+
+ sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+ sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+ sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+ sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
+ sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad = __msa_hadd_u_w(sad0_0, sad0_0);
+ sad += __msa_hadd_u_w(sad0_1, sad0_1);
+ sad_array[0] = HADD_SW_S32(sad);
+
+ sad = __msa_hadd_u_w(sad1_0, sad1_0);
+ sad += __msa_hadd_u_w(sad1_1, sad1_1);
+ sad_array[1] = HADD_SW_S32(sad);
+
+ sad = __msa_hadd_u_w(sad2_0, sad2_0);
+ sad += __msa_hadd_u_w(sad2_1, sad2_1);
+ sad_array[2] = HADD_SW_S32(sad);
+
+ sad = __msa_hadd_u_w(sad3_0, sad3_0);
+ sad += __msa_hadd_u_w(sad3_1, sad3_1);
+ sad_array[3] = HADD_SW_S32(sad);
+
+ sad0_0 = (v8u16)__msa_ldi_h(0);
+ sad0_1 = (v8u16)__msa_ldi_h(0);
+ sad1_0 = (v8u16)__msa_ldi_h(0);
+ sad1_1 = (v8u16)__msa_ldi_h(0);
+ sad2_0 = (v8u16)__msa_ldi_h(0);
+ sad2_1 = (v8u16)__msa_ldi_h(0);
+ sad3_0 = (v8u16)__msa_ldi_h(0);
+ sad3_1 = (v8u16)__msa_ldi_h(0);
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB4(src_dup, 16, src0, src1, src2, src3);
+ src_dup += src_stride;
+ LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
+ ref_dup += ref_stride;
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
+ sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
+ sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
+ sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+ SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
+ sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad = __msa_hadd_u_w(sad0_0, sad0_0);
+ sad += __msa_hadd_u_w(sad0_1, sad0_1);
+ sad_array[4] = HADD_SW_S32(sad);
+
+ sad = __msa_hadd_u_w(sad1_0, sad1_0);
+ sad += __msa_hadd_u_w(sad1_1, sad1_1);
+ sad_array[5] = HADD_SW_S32(sad);
+
+ sad = __msa_hadd_u_w(sad2_0, sad2_0);
+ sad += __msa_hadd_u_w(sad2_1, sad2_1);
+ sad_array[6] = HADD_SW_S32(sad);
+
+ sad = __msa_hadd_u_w(sad3_0, sad3_0);
+ sad += __msa_hadd_u_w(sad3_1, sad3_1);
+ sad_array[7] = HADD_SW_S32(sad);
+}
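+
+/* Offsets 0-3 are accumulated in the first pass and offsets 4-7 in a second
+ * pass over the same rows through 'src_dup'/'ref_dup', presumably to keep the
+ * number of live v8u16 accumulators manageable. */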
+
+static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ src_ptr += (4 * src_stride);
+
+ LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref0_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref1_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref2_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref3_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad3 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
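+
+/* The x4d variants compute four SADs of the same source block, one for each
+ * reference pointer in 'aref_ptr', so several candidate positions can be
+ * evaluated in a single call during motion search. */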
+
+static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt;
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref0_ptr += (4 * ref_stride);
+ LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
+ ref1_ptr += (4 * ref_stride);
+ LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
+ ref2_ptr += (4 * ref_stride);
+ LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
+ ref3_ptr += (4 * ref_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt;
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ v16u8 src, ref0, ref1, ref2, ref3, diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref0 = LD_UB(ref0_ptr);
+ ref0_ptr += ref_stride;
+ ref1 = LD_UB(ref1_ptr);
+ ref1_ptr += ref_stride;
+ ref2 = LD_UB(ref2_ptr);
+ ref2_ptr += ref_stride;
+ ref3 = LD_UB(ref3_ptr);
+ ref3_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref1);
+ sad1 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref2);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref3);
+ sad3 += __msa_hadd_u_h(diff, diff);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref0 = LD_UB(ref0_ptr);
+ ref0_ptr += ref_stride;
+ ref1 = LD_UB(ref1_ptr);
+ ref1_ptr += ref_stride;
+ ref2 = LD_UB(ref2_ptr);
+ ref2_ptr += ref_stride;
+ ref3 = LD_UB(ref3_ptr);
+ ref3_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref1);
+ sad1 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref2);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref3);
+ sad3 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+
+ LD_UB2(ref0_ptr, 16, ref0, ref1);
+ ref0_ptr += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref1_ptr, 16, ref0, ref1);
+ ref1_ptr += ref_stride;
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref2_ptr, 16, ref0, ref1);
+ ref2_ptr += ref_stride;
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref3_ptr, 16, ref0, ref1);
+ ref3_ptr += ref_stride;
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 sad0_0 = { 0 };
+ v8u16 sad0_1 = { 0 };
+ v8u16 sad1_0 = { 0 };
+ v8u16 sad1_1 = { 0 };
+ v8u16 sad2_0 = { 0 };
+ v8u16 sad2_1 = { 0 };
+ v8u16 sad3_0 = { 0 };
+ v8u16 sad3_1 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+
+ LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
+ ref0_ptr += ref_stride;
+ sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
+ ref1_ptr += ref_stride;
+ sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
+ ref2_ptr += ref_stride;
+ sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
+ ref3_ptr += ref_stride;
+ sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0_0);
+ sad_array[0] += HADD_UH_U32(sad0_1);
+ sad_array[1] = HADD_UH_U32(sad1_0);
+ sad_array[1] += HADD_UH_U32(sad1_1);
+ sad_array[2] = HADD_UH_U32(sad2_0);
+ sad_array[2] += HADD_UH_U32(sad2_1);
+ sad_array[3] = HADD_UH_U32(sad3_0);
+ sad_array[3] += HADD_UH_U32(sad3_1);
+}
+
+static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff, pred, comp;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ comp = __msa_aver_u_b(pred, ref);
+ diff = __msa_asub_u_b(src, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return HADD_UH_U32(sad);
+}
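+
+/* Scalar reference (sketch) for the averaged-prediction SAD above; the
+ * reference is first blended with the compound predictor, then differenced:
+ *   comp = (ref_pixel + pred_pixel + 1) >> 1;
+ *   sad += abs(src_pixel - comp);
+ */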
+
+static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 diff0, diff1, pred0, pred1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
+ sad += SAD_UB2_UH(src0, src1, diff0, diff1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * 16);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * 16);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 comp0, comp1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
+ LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
+ ref += (4 * ref_stride);
+
+ LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
+ LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
+ sec_pred += (4 * 32);
+
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
+ sad += SAD_UB2_UH(src4, src5, comp0, comp1);
+ AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
+ sad += SAD_UB2_UH(src6, src7, comp0, comp1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 comp0, comp1, comp2, comp3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v4u32 sad;
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+ }
+
+ sad = __msa_hadd_u_w(sad0, sad0);
+ sad += __msa_hadd_u_w(sad1, sad1);
+
+ return HADD_SW_S32(sad);
+}
+
+#define AOM_SAD_4xHEIGHT_MSA(height) \
+ uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
+ }
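+
+/* For example, AOM_SAD_4xHEIGHT_MSA(8) below expands to the entry point
+ *   uint32_t aom_sad4x8_msa(const uint8_t *src, int32_t src_stride,
+ *                           const uint8_t *ref, int32_t ref_stride);
+ * and the remaining wrapper macros generate the other width/height variants
+ * in the same way. */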
+
+#define AOM_SAD_8xHEIGHT_MSA(height) \
+ uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_16xHEIGHT_MSA(height) \
+ uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_32xHEIGHT_MSA(height) \
+ uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_64xHEIGHT_MSA(height) \
+ uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_4xHEIGHTx3_MSA(height) \
+ void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_8xHEIGHTx3_MSA(height) \
+ void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_16xHEIGHTx3_MSA(height) \
+ void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_32xHEIGHTx3_MSA(height) \
+ void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_64xHEIGHTx3_MSA(height) \
+ void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_4xHEIGHTx8_MSA(height) \
+ void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_8xHEIGHTx8_MSA(height) \
+ void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_16xHEIGHTx8_MSA(height) \
+ void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_32xHEIGHTx8_MSA(height) \
+ void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_64xHEIGHTx8_MSA(height) \
+ void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sads) { \
+ sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_4xHEIGHTx4D_MSA(height) \
+ void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_8xHEIGHTx4D_MSA(height) \
+ void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_16xHEIGHTx4D_MSA(height) \
+ void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_32xHEIGHTx4D_MSA(height) \
+ void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_64xHEIGHTx4D_MSA(height) \
+ void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_AVGSAD_4xHEIGHT_MSA(height) \
+ uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_8xHEIGHT_MSA(height) \
+ uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_16xHEIGHT_MSA(height) \
+ uint32_t aom_sad16x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_32xHEIGHT_MSA(height) \
+ uint32_t aom_sad32x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_64xHEIGHT_MSA(height) \
+ uint32_t aom_sad64x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+/* clang-format off */
+// 64x64
+AOM_SAD_64xHEIGHT_MSA(64)
+AOM_SAD_64xHEIGHTx3_MSA(64)
+AOM_SAD_64xHEIGHTx8_MSA(64)
+AOM_SAD_64xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_64xHEIGHT_MSA(64)
+
+// 64x32
+AOM_SAD_64xHEIGHT_MSA(32)
+AOM_SAD_64xHEIGHTx3_MSA(32)
+AOM_SAD_64xHEIGHTx8_MSA(32)
+AOM_SAD_64xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_64xHEIGHT_MSA(32)
+
+// 32x64
+AOM_SAD_32xHEIGHT_MSA(64)
+AOM_SAD_32xHEIGHTx3_MSA(64)
+AOM_SAD_32xHEIGHTx8_MSA(64)
+AOM_SAD_32xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_32xHEIGHT_MSA(64)
+
+// 32x32
+AOM_SAD_32xHEIGHT_MSA(32)
+AOM_SAD_32xHEIGHTx3_MSA(32)
+AOM_SAD_32xHEIGHTx8_MSA(32)
+AOM_SAD_32xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_32xHEIGHT_MSA(32)
+
+// 32x16
+AOM_SAD_32xHEIGHT_MSA(16)
+AOM_SAD_32xHEIGHTx3_MSA(16)
+AOM_SAD_32xHEIGHTx8_MSA(16)
+AOM_SAD_32xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_32xHEIGHT_MSA(16)
+
+// 16x32
+AOM_SAD_16xHEIGHT_MSA(32)
+AOM_SAD_16xHEIGHTx3_MSA(32)
+AOM_SAD_16xHEIGHTx8_MSA(32)
+AOM_SAD_16xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_16xHEIGHT_MSA(32)
+
+// 16x16
+AOM_SAD_16xHEIGHT_MSA(16)
+AOM_SAD_16xHEIGHTx3_MSA(16)
+AOM_SAD_16xHEIGHTx8_MSA(16)
+AOM_SAD_16xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_16xHEIGHT_MSA(16)
+
+// 16x8
+AOM_SAD_16xHEIGHT_MSA(8)
+AOM_SAD_16xHEIGHTx3_MSA(8)
+AOM_SAD_16xHEIGHTx8_MSA(8)
+AOM_SAD_16xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_16xHEIGHT_MSA(8)
+
+// 8x16
+AOM_SAD_8xHEIGHT_MSA(16)
+AOM_SAD_8xHEIGHTx3_MSA(16)
+AOM_SAD_8xHEIGHTx8_MSA(16)
+AOM_SAD_8xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_8xHEIGHT_MSA(16)
+
+// 8x8
+AOM_SAD_8xHEIGHT_MSA(8)
+AOM_SAD_8xHEIGHTx3_MSA(8)
+AOM_SAD_8xHEIGHTx8_MSA(8)
+AOM_SAD_8xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_8xHEIGHT_MSA(8)
+
+// 8x4
+AOM_SAD_8xHEIGHT_MSA(4)
+AOM_SAD_8xHEIGHTx3_MSA(4)
+AOM_SAD_8xHEIGHTx8_MSA(4)
+AOM_SAD_8xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_8xHEIGHT_MSA(4)
+
+// 4x8
+AOM_SAD_4xHEIGHT_MSA(8)
+AOM_SAD_4xHEIGHTx3_MSA(8)
+AOM_SAD_4xHEIGHTx8_MSA(8)
+AOM_SAD_4xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_4xHEIGHT_MSA(8)
+
+// 4x4
+AOM_SAD_4xHEIGHT_MSA(4)
+AOM_SAD_4xHEIGHTx3_MSA(4)
+AOM_SAD_4xHEIGHTx8_MSA(4)
+AOM_SAD_4xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_4xHEIGHT_MSA(4)
+/* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
new file mode 100644
index 000000000..3eb85107d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -0,0 +1,1795 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/mips/macros_msa.h"
+#include "aom_dsp/variance.h"
+
+static const uint8_t bilinear_filters_msa[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
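+
+/* The row index is the eighth-pel sub-pixel offset and the two taps always
+ * sum to 128, so e.g. a half-pel offset of 4 blends the two neighbours
+ * equally:
+ *   out = (64 * a + 64 * b + 64) >> 7;  // == (a + b + 1) >> 1
+ */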
+
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+ }
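+
+/* In scalar terms the macro above accumulates, per pixel,
+ *   var += (src - ref) * (src - ref);  // sum of squared differences
+ *   sub += (src - ref);                // signed sum of differences
+ * which the VARIANCE_* macros below combine into the final variance. */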
+
+#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ sse - (((int64_t)diff * diff) >> shift)
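+
+/* These implement the single-pass variance identity
+ *   variance = sse - (sum * sum) / (w * h)
+ * with 'shift' = log2(w * h); the LARGE variant widens the squared sum to 64
+ * bits so it cannot overflow for the bigger block sizes. */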
+
+static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t height,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 pred, src = { 0 };
+ v16u8 ref = { 0 };
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t height,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src, ref, pred;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1, pred0, pred1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1, pred0, pred1;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v8i16 avg2 = { 0 };
+ v8i16 avg3 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 32; ht_cnt--;) {
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ vec += __msa_hadd_s_w(avg2, avg2);
+ vec += __msa_hadd_s_w(avg3, avg3);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
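+/* sub_pixel_sse_diff_*_h_msa: apply the 2-tap bilinear filter horizontally
+ * (shuffle adjacent byte pairs with 'mask', dot-product with the replicated
+ * filter taps, round by FILTER_BITS), then accumulate SSE and the sum of
+ * differences against 'dst'. The 32- and 64-wide versions process the block
+ * as 16-wide columns. */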
+static uint32_t sub_pixel_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ CALC_MSE_AVG_B(src0, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 filt0, out, ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, dst0, var, avg);
+ CALC_MSE_AVG_B(src1, dst1, var, avg);
+ CALC_MSE_AVG_B(src2, dst2, var, avg);
+ CALC_MSE_AVG_B(src3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
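+/* sub_pixel_sse_diff_*_v_msa: the same scheme with the 2-tap filter applied
+ * vertically; consecutive rows are interleaved and dot-multiplied with the
+ * filter taps, and one row (src0 = src4) is carried across loop iterations. */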
+static uint32_t sub_pixel_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4, out;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 ref = { 0 };
+ v16u8 src2110, src4332;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
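+/* sub_pixel_sse_diff_*_hv_msa: horizontal pass first (HORIZ_2TAP_FILT_UH),
+ * then the vertical 2-tap filter on the intermediate rows, before the usual
+ * SSE / sum-of-differences accumulation. */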
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out, ref = { 0 };
+ v16u8 filt_vt, filt_hz, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt_vt, filt_hz, vec0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ CALC_MSE_AVG_B(src2, ref2, var, avg);
+ CALC_MSE_AVG_B(src3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
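+/* The sub_pixel_avg_sse_diff_* variants repeat the three filtering paths
+ * above but average the filtered result with 'sec_pred' before comparing it
+ * to the reference. The 16-wide workers take an extra 'width' argument so the
+ * 32- and 64-wide wrappers can step 'sec_pred' by the full block width while
+ * iterating over 16-wide columns. */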
+static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 out, pred, filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 out, pred, filt0;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 pred0, pred1, pred2, pred3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
+ tmp2, tmp3);
+ AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
+ tmp2, tmp3);
+
+ CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+ CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+ CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+ CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 out, pred, ref = { 0 };
+ v16u8 src2110, src4332, filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, filt0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 out, pred, ref = { 0 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 pred0, pred1, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v16u8 out0, out1, out2, out3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
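+/* AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA emits aom_sub_pixel_variance<wd>x<ht>_msa:
+ * it selects the h, v or hv filtering path from the x/y offsets (falling back
+ * to plain aom_variance<wd>x<ht>_msa when both offsets are zero) and converts
+ * the returned SSE / diff pair into a variance with VARIANCE_<wd>Wx<ht>H.
+ * Illustrative call: aom_sub_pixel_variance16x16_msa(src, src_stride, 2, 0,
+ * ref, ref_stride, &sse) filters horizontally with the 2/8-pel taps {96, 32}
+ * and returns the 16x16 variance. */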
+#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
+ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
+ src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
+ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
+ sse); \
+ } \
+ } \
+ \
+ return var; \
+ }
+
+/* clang-format off */
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
+/* clang-format on */
+
+#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, ht, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)
+
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)
+
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)
+
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
+
+uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ int32_t xoffset, int32_t yoffset,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride, uint32_t *sse,
+ const uint8_t *sec_pred) {
+ int32_t diff;
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset];
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset];
+
+ if (yoffset) {
+ if (xoffset) {
+ *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
+ v_filter, 64, &diff);
+ } else {
+ *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ v_filter, 64, &diff);
+ }
+ } else {
+ if (xoffset) {
+ *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ h_filter, 64, &diff);
+ } else {
+ *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
+ sec_pred, &diff);
+ }
+ }
+
+ return VARIANCE_32Wx64H(*sse, diff);
+}
+
+#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
+ uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_64Wx##ht##H(*sse, diff); \
+ }
+
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
+/* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c
new file mode 100644
index 000000000..37b89765d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/subtract_msa.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t pred0, pred1, pred2, pred3;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
+}
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ uint64_t src0, src1, pred0, pred1;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ LD2(src_ptr, src_stride, src0, src1);
+ src_ptr += (2 * src_stride);
+ LD2(pred_ptr, pred_stride, pred0, pred1);
+ pred_ptr += (2 * pred_stride);
+
+ INSERT_D2_SB(src0, src1, src);
+ INSERT_D2_SB(pred0, pred1, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+ diff_ptr += (2 * diff_stride);
+ }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ int8_t count;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (count = 2; count--;) {
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
+ pred7);
+ pred += (8 * pred_stride);
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ LD_SB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_SB2(src, 16, src2, src3);
+ src += src_stride;
+ LD_SB2(src, 16, src4, src5);
+ src += src_stride;
+ LD_SB2(src, 16, src6, src7);
+ src += src_stride;
+
+ LD_SB2(pred, 16, pred0, pred1);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred2, pred3);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred4, pred5);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 32; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_SB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+
+ LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+ pred += pred_stride;
+ LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+ }
+}
+
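+/* Computes diff = src - pred for a rows x cols block. Square sizes from 4x4
+   up to 64x64 use the MSA kernels above; any other shape falls back to the
+   generic C implementation. */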
+void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 8:
+ sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 16:
+ sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 32:
+ sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 64:
+ sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ default:
+ aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+ } else {
+ aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
new file mode 100644
index 000000000..cba5d4445
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+
+#include "aom_dsp/mips/macros_msa.h"
+
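+/* Butterfly rotation used by the forward/inverse transforms. For each of the
+   eight lanes:
+     out0 = ROUND_POWER_OF_TWO(reg0 * cnst0 - reg1 * cnst1, DCT_CONST_BITS)
+     out1 = ROUND_POWER_OF_TWO(reg0 * cnst1 + reg1 * cnst0, DCT_CONST_BITS) */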
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
+ { \
+ v8i16 k0_m = __msa_fill_h(cnst0); \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = (v4i32)__msa_fill_h(cnst1); \
+ k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
+ \
+ ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
+ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
+ DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ \
+ DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ }
+
+#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \
+ dst1, dst2, dst3) \
+ { \
+ v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
+ v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
+ \
+ DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \
+ tp4_m); \
+ DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \
+ tp8_m); \
+ BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
+ BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
+ SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
+ SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
+ dst1, dst2, dst3); \
+ }
+
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \
+ ({ \
+ v8i16 dst_m; \
+ v4i32 tp0_m, tp1_m; \
+ \
+ DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
+ SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
+ dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
+ \
+ dst_m; \
+ })
+
+#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \
+ { \
+ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
+ v8i16 madd_s0_m, madd_s1_m; \
+ \
+ ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
+ madd0_m, madd1_m, madd2_m, madd3_m); \
+ SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
+ }
+
+#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
+ out2, out3) \
+ { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
+ cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
+ cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
+ }
+#endif // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c
new file mode 100644
index 000000000..745fdfc9c
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/variance_msa.c
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
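+/* CALC_MSE_B accumulates the squared byte differences of src and ref into
+   'var'. CALC_MSE_AVG_B additionally accumulates the signed differences into
+   'sub' so the caller can recover the block mean. */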
+#define CALC_MSE_B(src, ref, var) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ }
+
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+ }
+
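+/* variance = SSE - sum^2 / (width * height), where 'shift' is
+   log2(width * height). The LARGE variant keeps the squared sum in 64 bits
+   to avoid overflow on big blocks. */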
+#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ sse - (((int64_t)diff * diff) >> shift)
+
+static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ int32_t ht_cnt;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src, ref;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v8i16 avg2 = { 0 };
+ v8i16 avg3 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 32; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ vec += __msa_hadd_s_w(avg2, avg2);
+ vec += __msa_hadd_s_w(avg3, avg3);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
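+/* Returns the sum of squares of a 16x16 block of int16 values (256 samples
+   read contiguously from 'src'). */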
+static uint32_t get_mb_ss_msa(const int16_t *src) {
+ uint32_t sum, cnt;
+ v8i16 src0, src1, src2, src3;
+ v4i32 src0_l, src1_l, src2_l, src3_l;
+ v4i32 src0_r, src1_r, src2_r, src3_r;
+ v2i64 sq_src_l = { 0 };
+ v2i64 sq_src_r = { 0 };
+
+ for (cnt = 8; cnt--;) {
+ LD_SH4(src, 8, src0, src1, src2, src3);
+ src += 4 * 8;
+
+ UNPCK_SH_SW(src0, src0_l, src0_r);
+ UNPCK_SH_SW(src1, src1_l, src1_r);
+ UNPCK_SH_SW(src2, src2_l, src2_r);
+ UNPCK_SH_SW(src3, src3_l, src3_r);
+
+ DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
+ }
+
+ sq_src_l += __msa_splati_d(sq_src_l, 1);
+ sq_src_r += __msa_splati_d(sq_src_r, 1);
+
+ sum = __msa_copy_s_d(sq_src_l, 0);
+ sum += __msa_copy_s_d(sq_src_r, 0);
+
+ return sum;
+}
+
+static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_B(src, ref, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src, ref;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = height >> 1; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src2, ref2, var);
+ CALC_MSE_B(src1, ref1, var);
+ CALC_MSE_B(src3, ref3, var);
+
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src2, ref2, var);
+ CALC_MSE_B(src1, ref1, var);
+ CALC_MSE_B(src3, ref3, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride) {
+ uint32_t err = 0;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16i8 src = { 0 };
+ v16i8 ref = { 0 };
+ v16u8 src_vec0, src_vec1;
+ v8i16 diff0, diff1;
+ v4i32 err0 = { 0 };
+ v4i32 err1 = { 0 };
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
+ ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
+ HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
+ DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
+ err = HADD_SW_S32(err0);
+ err += HADD_SW_S32(err1);
+
+ return err;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
+#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t aom_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, uint32_t *sse) { \
+ int32_t diff; \
+ \
+ *sse = \
+ sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+
+/* clang-format off */
+AOM_VARIANCE_WDXHT_MSA(4, 4)
+AOM_VARIANCE_WDXHT_MSA(4, 8)
+
+AOM_VARIANCE_WDXHT_MSA(8, 4)
+AOM_VARIANCE_WDXHT_MSA(8, 8)
+AOM_VARIANCE_WDXHT_MSA(8, 16)
+
+AOM_VARIANCE_WDXHT_MSA(16, 8)
+AOM_VARIANCE_WDXHT_MSA(16, 16)
+AOM_VARIANCE_WDXHT_MSA(16, 32)
+
+AOM_VARIANCE_WDXHT_MSA(32, 16)
+AOM_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
+
+uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_32Wx64H(*sse, diff);
+}
+
+uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx32H(*sse, diff);
+}
+
+uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
+ *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
+
+ return *sse;
+}
+
+uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+
+uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
+
+ return *sse;
+}
+
+uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+
+void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
+}
+
+void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
+}
+
+uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h
new file mode 100644
index 000000000..11a8c5ad7
--- /dev/null
+++ b/third_party/aom/aom_dsp/postproc.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_POSTPROC_H_
+#define AOM_DSP_POSTPROC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Fills a noise buffer with Gaussian noise whose strength is determined by sigma.
+int aom_setup_noise(double sigma, int size, char *noise);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_DSP_POSTPROC_H_
diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c
new file mode 100644
index 000000000..c60bfdac5
--- /dev/null
+++ b/third_party/aom/aom_dsp/prob.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+
+#if CONFIG_EC_MULTISYMBOL
+#include <string.h>
+#endif
+
+#include "aom_dsp/prob.h"
+
+const uint8_t aom_norm[256] = {
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static unsigned int tree_merge_probs_impl(unsigned int i,
+ const aom_tree_index *tree,
+ const aom_prob *pre_probs,
+ const unsigned int *counts,
+ aom_prob *probs) {
+ const int l = tree[i];
+ const unsigned int left_count =
+ (l <= 0) ? counts[-l]
+ : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
+ const int r = tree[i + 1];
+ const unsigned int right_count =
+ (r <= 0) ? counts[-r]
+ : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
+ const unsigned int ct[2] = { left_count, right_count };
+ probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
+ return left_count + right_count;
+}
+
+void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
+ const unsigned int *counts, aom_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
+}
+
+#if CONFIG_EC_MULTISYMBOL
+typedef struct tree_node tree_node;
+
+struct tree_node {
+ aom_tree_index index;
+ uint8_t probs[16];
+ uint8_t prob;
+ int path;
+ int len;
+ int l;
+ int r;
+ aom_cdf_prob pdf;
+};
+
+/* Compute the probability of this node in Q23 */
+static uint32_t tree_node_prob(tree_node n, int i) {
+ uint32_t prob;
+ /* 1.0 in Q23 */
+ prob = 16777216;
+ for (; i < n.len; i++) {
+ prob = prob * n.probs[i] >> 8;
+ }
+ return prob;
+}
+
+static int tree_node_cmp(tree_node a, tree_node b) {
+ int i;
+ uint32_t pa;
+ uint32_t pb;
+ for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
+ }
+ pa = tree_node_prob(a, i);
+ pb = tree_node_prob(b, i);
+ return pa > pb ? 1 : pa < pb ? -1 : 0;
+}
+
+/* Given a Q15 probability for the symbol subtree rooted at tree[n], this
+   function computes the probability of each symbol (defined as a node that
+   has no children). */
+static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n,
+ aom_cdf_prob pdf) {
+ if (tree[n].l == 0) {
+ /* This prevents probability computations in Q15 that underflow from
+ producing a symbol that has zero probability. */
+ if (pdf == 0) pdf = 1;
+ tree[n].pdf = pdf;
+ return pdf;
+ } else {
+    /* Process the subtree with the smaller probability first. */
+ if (tree[n].prob < 128) {
+ aom_cdf_prob lp;
+ aom_cdf_prob rp;
+ lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8;
+ lp = tree_node_compute_probs(tree, tree[n].l, lp);
+ rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 0 : pdf - lp);
+ return lp + rp;
+ } else {
+ aom_cdf_prob rp;
+ aom_cdf_prob lp;
+ rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8;
+ rp = tree_node_compute_probs(tree, tree[n].r, rp);
+ lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp);
+ return lp + rp;
+ }
+ }
+}
+
+static int tree_node_extract(tree_node *tree, int n, int symb,
+ aom_cdf_prob *pdf, aom_tree_index *index,
+ int *path, int *len) {
+ if (tree[n].l == 0) {
+ pdf[symb] = tree[n].pdf;
+ if (index != NULL) index[symb] = tree[n].index;
+ if (path != NULL) path[symb] = tree[n].path;
+ if (len != NULL) len[symb] = tree[n].len;
+ return symb + 1;
+ } else {
+ symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len);
+ return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len);
+ }
+}
+
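+/* Builds a multisymbol CDF in Q15 from a binary coding tree and its per-node
+   probabilities. Pending internal nodes are expanded most-probable-first
+   until the tree is fully expanded or the 16-symbol limit is reached; the
+   optional index, path and len arrays record how each symbol maps back onto
+   the tree. Returns the number of symbols. */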
+int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
+ aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index,
+ int *path, int *len) {
+ tree_node symb[2 * 16 - 1];
+ int nodes;
+ int next[16];
+ int size;
+ int nsymbs;
+ int i;
+ /* Create the root node with probability 1 in Q15. */
+ symb[0].index = root;
+ symb[0].path = 0;
+ symb[0].len = 0;
+ symb[0].l = symb[0].r = 0;
+ nodes = 1;
+ next[0] = 0;
+ size = 1;
+ nsymbs = 1;
+ while (size > 0 && nsymbs < 16) {
+ int m;
+ tree_node n;
+ aom_tree_index j;
+ uint8_t prob;
+ m = 0;
+ /* Find the internal node with the largest probability. */
+ for (i = 1; i < size; i++) {
+ if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i;
+ }
+ i = next[m];
+ memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1)));
+ size--;
+ /* Split this symbol into two symbols */
+ n = symb[i];
+ j = n.index;
+ prob = probs[j >> 1];
+ /* Left */
+ n.index = tree[j];
+ n.path <<= 1;
+ n.len++;
+ n.probs[n.len - 1] = prob;
+ symb[nodes] = n;
+ if (n.index > 0) {
+ next[size++] = nodes;
+ }
+ /* Right */
+ n.index = tree[j + 1];
+ n.path += 1;
+ n.probs[n.len - 1] = 256 - prob;
+ symb[nodes + 1] = n;
+ if (n.index > 0) {
+ next[size++] = nodes + 1;
+ }
+ symb[i].prob = prob;
+ symb[i].l = nodes;
+ symb[i].r = nodes + 1;
+ nodes += 2;
+ nsymbs++;
+ }
+ /* Compute the probabilities of each symbol in Q15 */
+ tree_node_compute_probs(symb, 0, CDF_PROB_TOP);
+ /* Extract the cdf, index, path and length */
+ tree_node_extract(symb, 0, 0, cdf, index, path, len);
+ /* Convert to CDF */
+ cdf[0] = AOM_ICDF(cdf[0]);
+ for (i = 1; i < nsymbs; i++) {
+ cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i - 1]) + cdf[i]);
+ }
+// Store symbol count at the end of the CDF
+#if CONFIG_EC_ADAPT
+ cdf[nsymbs] = 0;
+#endif
+ return nsymbs;
+}
+
+/* This code assumes that tree contains as unique leaf nodes the integer values
+ 0 to len - 1 and produces the forward and inverse mapping tables in ind[]
+ and inv[] respectively. */
+static void tree_to_index(int *stack_index, int *ind, int *inv,
+ const aom_tree_index *tree, int value, int index) {
+ value *= 2;
+
+ do {
+ const aom_tree_index content = tree[index];
+ ++index;
+ if (content <= 0) {
+ inv[*stack_index] = -content;
+ ind[-content] = *stack_index;
+ ++(*stack_index);
+ } else {
+ tree_to_index(stack_index, ind, inv, tree, value, content);
+ }
+ } while (++value & 1);
+}
+
+void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree) {
+ int stack_index = 0;
+ tree_to_index(&stack_index, ind, inv, tree, 0, 0);
+}
+#endif
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
new file mode 100644
index 000000000..808592923
--- /dev/null
+++ b/third_party/aom/aom_dsp/prob.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_PROB_H_
+#define AOM_DSP_PROB_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_common.h"
+
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_DAALA_EC
+#include "aom_dsp/entcode.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint8_t aom_prob;
+
+// TODO(negge): Rename this aom_prob once we remove vpxbool.
+typedef uint16_t aom_cdf_prob;
+
+#if CONFIG_EC_MULTISYMBOL
+#define CDF_SIZE(x) ((x) + 1)
+#endif
+
+#define CDF_PROB_BITS 15
+#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+
+#if CONFIG_DAALA_EC
+#define AOM_ICDF OD_ICDF
+#else
+#define AOM_ICDF(x) (x)
+#endif
+
+#define MAX_PROB 255
+
+#define aom_prob_half ((aom_prob)128)
+
+typedef int8_t aom_tree_index;
+
+#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))
+
+#define MODE_MV_COUNT_SAT 20
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of aom_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const aom_tree_index aom_tree[];
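+
+/* Illustrative example (not a table from this codebase): the 3-symbol tree
+   { 0, 2, -1, -2 } decodes a 0 bit at node 0 as symbol 0, while a 1 bit moves
+   to index 2, where a 0 bit gives symbol 1 and a 1 bit gives symbol 2. */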
+
+static INLINE aom_prob get_prob(unsigned int num, unsigned int den) {
+ assert(den != 0);
+ {
+ const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
+ // (p > 255) ? 255 : (p < 1) ? 1 : p;
+ const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
+ return (aom_prob)clipped_prob;
+ }
+}
+
+static INLINE aom_prob get_binary_prob(unsigned int n0, unsigned int n1) {
+ const unsigned int den = n0 + n1;
+ if (den == 0) return 128u;
+ return get_prob(n0, den);
+}
+
+/* This function assumes prob1 and prob2 are already within [1,255] range. */
+static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) {
+ return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
+}
+
+static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2],
+ unsigned int count_sat,
+ unsigned int max_update_factor) {
+ const aom_prob prob = get_binary_prob(ct[0], ct[1]);
+ const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat);
+ const unsigned int factor = max_update_factor * count / count_sat;
+ return weighted_prob(pre_prob, prob, factor);
+}
+
+// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+ 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+ 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob,
+ const unsigned int ct[2]) {
+ const unsigned int den = ct[0] + ct[1];
+ if (den == 0) {
+ return pre_prob;
+ } else {
+ const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT);
+ const unsigned int factor = count_to_update_factor[count];
+ const aom_prob prob = get_prob(ct[0], den);
+ return weighted_prob(pre_prob, prob, factor);
+ }
+}
+
+void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
+ const unsigned int *counts, aom_prob *probs);
+
+#if CONFIG_EC_MULTISYMBOL
+int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
+ aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
+ int *pth, int *len);
+
+static INLINE void av1_tree_to_cdf(const aom_tree_index *tree,
+ const aom_prob *probs, aom_cdf_prob *cdf) {
+ aom_tree_index index[16];
+ int path[16];
+ int dist[16];
+ tree_to_cdf(tree, probs, 0, cdf, index, path, dist);
+}
+
+#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \
+ do { \
+ int i; \
+ for (i = 0; i < u; i++) { \
+ av1_tree_to_cdf(tree, probs[i], cdf[i]); \
+ } \
+ } while (0)
+
+#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u) \
+ do { \
+ int j; \
+ int i; \
+ for (j = 0; j < v; j++) { \
+ for (i = 0; i < u; i++) { \
+ av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \
+ } \
+ } \
+ } while (0)
+
+void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree);
+#endif
+
+DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]);
+
+#if CONFIG_EC_ADAPT
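+/* Adapts the CDF in place after coding symbol 'val': each entry moves a
+   fraction 2^-rate of the way toward a target distribution concentrated on
+   'val'. The per-context symbol count kept in cdf[nsymbs] halves the
+   adaptation step once more than 31 symbols have been seen. */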
+static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
+ const int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs);
+ const int rate2 = 5;
+ int i, tmp;
+ int diff;
+#if 1
+ const int tmp0 = 1 << rate2;
+ tmp = AOM_ICDF(tmp0);
+ diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate;
+// Single loop (faster)
+#if CONFIG_DAALA_EC && CONFIG_EC_SMALLMUL
+ for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) {
+ tmp -= (i == val ? diff : 0);
+ cdf[i] += ((tmp - cdf[i]) >> rate);
+ }
+#else
+ for (i = 0; i < nsymbs - 1; ++i, tmp += tmp0) {
+ tmp += (i == val ? diff : 0);
+ cdf[i] -= ((cdf[i] - tmp) >> rate);
+ }
+#endif
+#else
+ for (i = 0; i < nsymbs; ++i) {
+ tmp = (i + 1) << rate2;
+ cdf[i] -= ((cdf[i] - tmp) >> rate);
+ }
+ diff = CDF_PROB_TOP - cdf[nsymbs - 1];
+
+ for (i = val; i < nsymbs; ++i) {
+ cdf[i] += diff;
+ }
+#endif
+ cdf[nsymbs] += (cdf[nsymbs] < 32);
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_PROB_H_
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
new file mode 100644
index 000000000..461c13729
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/psnr.h"
+#include "aom_scale/yv12config.h"
+
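+/* PSNR = 10 * log10(samples * peak^2 / sse), clamped to MAX_PSNR (100 dB)
+   when the SSE is zero or the ratio would exceed the cap. */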
+double aom_sse_to_psnr(double samples, double peak, double sse) {
+ if (sse > 0.0) {
+ const double psnr = 10.0 * log10(samples * peak * peak / sse);
+ return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+ } else {
+ return MAX_PSNR;
+ }
+}
+
+/* TODO(yaowu): The block_variance calls use the unoptimized versions of
+ * variance() and highbd_8_variance(). They should not.
+ */
+static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, unsigned int *sse,
+ int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h, uint64_t *sse, int64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h, unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
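+/* Sums squared differences over an arbitrary width x height region. The bulk
+   is covered with optimized 16x16 MSE calls; any right or bottom remainder
+   (width or height not a multiple of 16) is handled by the C loops above. */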
+static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ const int dw = width % 16;
+ const int dh = height % 16;
+ int64_t total_sse = 0;
+ unsigned int sse = 0;
+ int sum = 0;
+ int x, y;
+
+ if (dw > 0) {
+ encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
+ height, &sse, &sum);
+ total_sse += sse;
+ }
+
+ if (dh > 0) {
+ encoder_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh,
+ &sse, &sum);
+ total_sse += sse;
+ }
+
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ aom_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+
+ pa += 16;
+ pb += 16;
+ }
+
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+
+ return total_sse;
+}
+
+#if CONFIG_HIGHBITDEPTH
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height, unsigned int input_shift) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t total_sse = 0;
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ int64_t diff;
+ diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+ total_sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int64_t total_sse = 0;
+ int x, y;
+ const int dw = width % 16;
+ const int dh = height % 16;
+ unsigned int sse = 0;
+ int sum = 0;
+ if (dw > 0) {
+ encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height, &sse, &sum);
+ total_sse += sse;
+ }
+ if (dh > 0) {
+ encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh, &sse, &sum);
+ total_sse += sse;
+ }
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+ pa += 16;
+ pb += 16;
+ }
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+ return total_sse;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride,
+ width, height);
+}
+
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+
+ return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
+ b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
+ width, height);
+}
+
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
+ b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
+ width, height);
+}
+
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+#if CONFIG_HIGHBITDEPTH
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(
+ a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height);
+}
+
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride,
+ b->u_buffer + vstart * b->uv_stride + hstart,
+ b->uv_stride, width, height);
+}
+
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride,
+ b->v_buffer + vstart * b->uv_stride + hstart,
+ b->uv_stride, width, height);
+}
+
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_HIGHBITDEPTH
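+/* Computes per-plane and combined PSNR for high-bit-depth frames. When the
+   coded bit depth exceeds the input bit depth, samples are shifted down by
+   (bit_depth - in_bit_depth) before the SSE is accumulated. */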
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ uint32_t bit_depth, uint32_t in_bit_depth) {
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const unsigned int input_shift = bit_depth - in_bit_depth;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ uint64_t sse;
+ if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (input_shift) {
+ sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i],
+ b_strides[i], w, h, input_shift);
+ } else {
+ sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i],
+ b_strides[i], w, h);
+ }
+ } else {
+ sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
+ }
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+}
+
+#endif  // CONFIG_HIGHBITDEPTH
+
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr) {
+ static const double peak = 255.0;
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ const uint64_t sse =
+ get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+}
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
new file mode 100644
index 000000000..480140e6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_PSNR_H_
+#define AOM_DSP_PSNR_H_
+
+#include "aom_scale/yv12config.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ double psnr[4]; // total/y/u/v
+ uint64_t sse[4]; // total/y/u/v
+ uint32_t samples[4]; // total/y/u/v
+} PSNR_STATS;
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
+double aom_sse_to_psnr(double samples, double peak, double sse);
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_HIGHBITDEPTH
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ unsigned int bit_depth, unsigned int in_bit_depth);
+#endif
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr);
+
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *phvs_y,
+ double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_DSP_PSNR_H_
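As a usage sketch (a hypothetical caller, not part of the header above), the PSNR_STATS layout places the combined figure at index 0 and the Y/U/V planes at indices 1-3:

    #include <stdio.h>
    #include "aom_dsp/psnr.h"

    /* Hypothetical helper: report overall and per-plane PSNR for two frames. */
    static void report_psnr(const YV12_BUFFER_CONFIG *src,
                            const YV12_BUFFER_CONFIG *rec) {
      PSNR_STATS stats;
      aom_calc_psnr(src, rec, &stats);
      printf("PSNR %.2f dB (Y %.2f, U %.2f, V %.2f)\n", stats.psnr[0],
             stats.psnr[1], stats.psnr[2], stats.psnr[3]);
    }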
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
new file mode 100644
index 000000000..aeefd5908
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnrhvs.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by: Gregory Maxwell, at the Daala
+ * project.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/system_state.h"
+
+#if !defined(M_PI)
+#define M_PI (3.141592653589793238462643)
+#endif
+#include <string.h>
+
+static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ aom_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ aom_highbd_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+#endif
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper;
+ * this one gives slightly higher MOS agreement. */
+static const double csf_y[8][8] = {
+ { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+ 0.678296995242, 0.466224900598, 0.3265091542 },
+ { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+ 0.868920337363, 0.61280991668, 0.436405793551 },
+ { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+ 0.670882927016, 0.501731932449, 0.372504254596 },
+ { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
+ 0.48309405692, 0.380429446972, 0.295774038565 },
+ { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
+ 0.352889268808, 0.283006984131, 0.226951348204 },
+ { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+ 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
+ { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+ 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
+ { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+ 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
+};
+static const double csf_cb420[8][8] = {
+ { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242 },
+ { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625 },
+ { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837 },
+ { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374 },
+ { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034 },
+ { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+ 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
+ { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+ 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
+ { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+ 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
+};
+static const double csf_cr420[8][8] = {
+ { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971 },
+ { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198 },
+ { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+ 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
+ { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023 },
+ { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273 },
+ { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+ 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
+ { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+ 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
+ { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+ 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
+};
+
+static double convert_score_db(double _score, double _weight, int bit_depth) {
+ int16_t pix_max = 255;
+ assert(_score * _weight >= 0.0);
+ if (bit_depth == 10)
+ pix_max = 1023;
+ else if (bit_depth == 12)
+ pix_max = 4095;
+
+ if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
+ return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
+}
+
+static double calc_psnrhvs(const unsigned char *src, int _systride,
+ const unsigned char *dst, int _dystride, double _par,
+ int _w, int _h, int _step, const double _csf[8][8],
+ uint32_t bit_depth, uint32_t _shift) {
+ double ret;
+ const uint8_t *_src8 = src;
+ const uint8_t *_dst8 = dst;
+ const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
+ int16_t dct_s[8 * 8], dct_d[8 * 8];
+ tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+ double mask[8][8];
+ int pixels;
+ int x;
+ int y;
+ (void)_par;
+ ret = pixels = 0;
+ /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+ their masking table as "we have used the quantization table for the
+ color component Y of JPEG [6] that has been also obtained on the
+ basis of CSF. Note that the values in quantization table JPEG have
+ been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+  was also constructed from the JPEG matrices. I cannot find any obvious
+ scheme of normalizing to produce their table, but if I multiply their
+ CSF by 0.38857 and square the result I get their masking table.
+ I have no idea where this constant comes from, but deviating from it
+ too greatly hurts MOS agreement.
+
+ [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+ of DCT basis functions", CD-ROM Proceedings of the Third
+ International Workshop on Video Processing and Quality Metrics for Consumer
+ Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+ for (x = 0; x < 8; x++)
+ for (y = 0; y < 8; y++)
+ mask[x][y] =
+ (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003);
+ for (y = 0; y < _h - 7; y += _step) {
+ for (x = 0; x < _w - 7; x += _step) {
+ int i;
+ int j;
+ double s_means[4];
+ double d_means[4];
+ double s_vars[4];
+ double d_vars[4];
+ double s_gmean = 0;
+ double d_gmean = 0;
+ double s_gvar = 0;
+ double d_gvar = 0;
+ double s_mask = 0;
+ double d_mask = 0;
+ for (i = 0; i < 4; i++)
+ s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ if (bit_depth == 8 && _shift == 0) {
+ dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
+ dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
+ } else if (bit_depth == 10 || bit_depth == 12) {
+ dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
+ dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
+ }
+ s_gmean += dct_s[i * 8 + j];
+ d_gmean += dct_d[i * 8 + j];
+ s_means[sub] += dct_s[i * 8 + j];
+ d_means[sub] += dct_d[i * 8 + j];
+ }
+ }
+ s_gmean /= 64.f;
+ d_gmean /= 64.f;
+ for (i = 0; i < 4; i++) s_means[i] /= 16.f;
+ for (i = 0; i < 4; i++) d_means[i] /= 16.f;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
+ d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
+ s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) *
+ (dct_s[i * 8 + j] - s_means[sub]);
+ d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) *
+ (dct_d[i * 8 + j] - d_means[sub]);
+ }
+ }
+ s_gvar *= 1 / 63.f * 64;
+ d_gvar *= 1 / 63.f * 64;
+ for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16;
+ for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16;
+ if (s_gvar > 0)
+ s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
+ if (d_gvar > 0)
+ d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+#if CONFIG_HIGHBITDEPTH
+ if (bit_depth == 10 || bit_depth == 12) {
+ hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ }
+#endif
+ if (bit_depth == 8) {
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ }
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
+ s_mask = sqrt(s_mask * s_gvar) / 32.f;
+ d_mask = sqrt(d_mask * d_gvar) / 32.f;
+ if (d_mask > s_mask) s_mask = d_mask;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ double err;
+ err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
+ if (i != 0 || j != 0)
+ err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+ ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+ pixels++;
+ }
+ }
+ }
+ }
+ if (pixels <= 0) return 0;
+ ret /= pixels;
+ return ret;
+}
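As a concrete instance of the 0.38857 normalization described in the comment above (an editorial worked example using csf_y[0][0] from this file): mask[0][0] = (1.6193873005 * 0.3885746225901003)^2 ≈ 0.396, and in the error loop each coefficient difference is thresholded by s_mask / mask[i][j] before being weighted by _csf[i][j] and accumulated.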
+
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
+ double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs,
+ uint32_t bd, uint32_t in_bd) {
+ double psnrhvs;
+ const double par = 1.0;
+ const int step = 7;
+ uint32_t bd_shift = 0;
+ aom_clear_system_state();
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ assert(bd >= in_bd);
+
+ bd_shift = bd - in_bd;
+
+ *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, par, src->y_crop_width,
+ src->y_crop_height, step, csf_y, bd, bd_shift);
+ *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, par, src->uv_crop_width,
+ src->uv_crop_height, step, csf_cb420, bd, bd_shift);
+ *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, par, src->uv_crop_width,
+ src->uv_crop_height, step, csf_cr420, bd, bd_shift);
+ psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+ return convert_score_db(psnrhvs, 1.0, in_bd);
+}
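A brief usage sketch (hypothetical caller): with 8-bit buffers both bd and in_bd are 8, the return value is already in decibels, and, as the code above shows, the per-plane outputs are left as raw masked error scores rather than dB values in this version:

    #include "aom_dsp/psnr.h"

    /* Hypothetical wrapper: combined PSNR-HVS in dB for two 8-bit frames. */
    static double frame_psnrhvs_8bit(const YV12_BUFFER_CONFIG *src,
                                     const YV12_BUFFER_CONFIG *rec) {
      double y_hvs, u_hvs, v_hvs;
      return aom_psnrhvs(src, rec, &y_hvs, &u_hvs, &v_hvs, 8, 8);
    }

For orientation, a combined masked score of 1.0 converts to 10 * (log10(255 * 255) - log10(1.0)) ≈ 48.1 dB, and anything below 255 * 255 * 1e-10 is clamped to MAX_PSNR by convert_score_db().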
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
new file mode 100644
index 000000000..0759c22e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+
+static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+ const int log_scale) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+#if CONFIG_AOM_QM
+ const qm_val_t wt = qm_ptr[rc];
+ const int coeff = coeff_ptr[rc] * wt;
+#else
+ const int coeff = coeff_ptr[rc];
+#endif // CONFIG_AOM_QM
+
+#if CONFIG_AOM_QM
+ if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
+ coeff > (nzbins[rc != 0] << AOM_QM_BITS))
+ non_zero_count--;
+#else
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) non_zero_count--;
+#endif // CONFIG_AOM_QM
+ else
+ break;
+ }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32;
+
+#if CONFIG_AOM_QM
+ const qm_val_t wt = qm_ptr[rc];
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+#else
+ if (abs_coeff >= zbins[rc != 0]) {
+#endif // CONFIG_AOM_QM
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+#if CONFIG_AOM_QM
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+#else
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale)); // quantization
+#endif // CONFIG_AOM_QM
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+#if CONFIG_AOM_QM
+ const int dequant =
+ (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
+#else
+ dqcoeff_ptr[rc] =
+ qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale);
+#endif // CONFIG_AOM_QM
+
+ if (tmp32) eob = i;
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 0);
+}
+
+void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 1);
+}
+
+#if CONFIG_TX64X64
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 2);
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_AOM_QM
+void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp, eob = -1;
+ int32_t tmp32;
+ int dequant =
+ (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+ if (tmp32) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+ const int n_coeffs = 1024;
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp, eob = -1;
+ int32_t tmp32;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dequant =
+ (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
+ if (tmp32) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+ const int n_coeffs = 1024;
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp, eob = -1;
+ int32_t tmp32;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
+ INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dequant =
+ (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+ if (tmp32) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ int eob = -1;
+ int dequant =
+ (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[0];
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ const int n_coeffs = 1024;
+ int eob = -1;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2;
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ const int n_coeffs = 1024;
+ int eob = -1;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4;
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int dequant;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr[rc];
+ const int coeff = coeff_ptr[rc] * wt;
+
+ if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
+ coeff > (nzbins[rc != 0] << AOM_QM_BITS))
+ non_zero_count--;
+ else
+ break;
+ }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+ if (abs_qcoeff) eob = i;
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_b_32x32_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ int dequant;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr[rc];
+ const int coeff = coeff_ptr[rc] * wt;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
+ coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const qm_val_t wt = qm_ptr[rc];
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_b_64x64_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+ int idx = 0;
+ int idx_arr[4096];
+ int i, eob = -1;
+ int dequant;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr[rc];
+ const int coeff = coeff_ptr[rc] * wt;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
+ coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const qm_val_t wt = qm_ptr[rc];
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+
+#else // CONFIG_AOM_QM
+
+void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant) >> 16;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+ if (tmp) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant) >> 15;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+ if (tmp) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ const int n_coeffs = 4096;
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
+ INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant) >> 14;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4;
+ if (tmp) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[0];
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr,
+ uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr,
+ uint16_t *eob_ptr) {
+ const int n_coeffs = 4096;
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14);
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4;
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
+ break;
+ }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= zbins[rc != 0]) {
+ const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ if (abs_qcoeff) eob = i;
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_b_32x32_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_b_64x64_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+ int idx = 0;
+ int idx_arr[4096];
+ int i, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_AOM_QM
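All of the *_quantize_b_* helpers above share one per-coefficient recipe: a zero-bin dead zone, a rounding offset, a two-stage 16-bit multiply (quant, then quant_shift), sign restoration, and dequantization. A minimal standalone sketch of that recipe for the non-QM, log_scale == 0 case (names here are illustrative, not the library's API):

    #include <stdint.h>

    /* Sketch of the per-coefficient math in quantize_b_helper_c with
     * CONFIG_AOM_QM off and log_scale == 0. */
    static void quantize_coeff_sketch(int32_t coeff, int zbin, int round,
                                      int quant, int quant_shift, int dequant,
                                      int32_t *qcoeff, int32_t *dqcoeff) {
      const int sign = coeff < 0 ? -1 : 1;
      const int32_t abs_coeff = coeff * sign;
      *qcoeff = 0;
      *dqcoeff = 0;
      if (abs_coeff >= zbin) {                    /* outside the dead zone */
        int32_t tmp = abs_coeff + round;          /* rounding offset */
        if (tmp > INT16_MAX) tmp = INT16_MAX;     /* clamp as in the source */
        const int32_t q = (int32_t)(
            (((((int64_t)tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
        *qcoeff = q * sign;                       /* restore the sign */
        *dqcoeff = *qcoeff * dequant;             /* reconstructed value */
      }
    }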
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
new file mode 100644
index 000000000..fe49b830f
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_QUANTIZE_H_
+#define AOM_DSP_QUANTIZE_H_
+
+#include "./aom_config.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_AOM_QM
+void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+#endif // CONFIG_TX64X64
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr);
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr);
+void aom_highbd_quantize_dc_32x32(
+ const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
+ const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr);
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(
+ const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
+ const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr);
+#endif // CONFIG_TX64X64
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+#endif // CONFIG_HIGHBITDEPTH
+
+#else // CONFIG_AOM_QM
+
+void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif // CONFIG_TX64X64
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr);
+void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr,
+ const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr,
+ const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_AOM_QM
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_QUANTIZE_H_
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
new file mode 100644
index 000000000..3e1070519
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+/* Sum the difference between every corresponding element of the buffers. */
+static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define sadMxN(m, n) \
+ unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint8_t comp_pred[m * n]; \
+ aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ }
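For readers unfamiliar with this macro style, each sadMxN(m, n) invocation below expands into a plain pair of C functions; for example, sadMxN(8, 8) produces roughly:

    /* Approximate expansion of sadMxN(8, 8) from the macro above. */
    unsigned int aom_sad8x8_c(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride) {
      return sad(src, src_stride, ref, ref_stride, 8, 8);
    }
    unsigned int aom_sad8x8_avg_c(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred) {
      uint8_t comp_pred[8 * 8];  /* averaged prediction, stride 8 */
      aom_comp_avg_pred_c(comp_pred, second_pred, 8, 8, ref, ref_stride);
      return sad(src, src_stride, comp_pred, 8, 8, 8);
    }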
+
+// Depending on call sites, pass **ref_array instead to avoid taking the
+// address in each subsequent call and to de-dup with the 4D variant below.
+#define sadMxNxK(m, n, k) \
+ void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < k; ++i) \
+ sad_array[i] = \
+ aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+ }
+
+// This appears to be equivalent to the above when k == 4 and refs is const
+#define sadMxNx4D(m, n) \
+ void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sad_array[i] = \
+ aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+ }
+
+/* clang-format off */
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+// 128x128
+sadMxN(128, 128)
+sadMxNxK(128, 128, 3)
+sadMxNxK(128, 128, 8)
+sadMxNx4D(128, 128)
+
+// 128x64
+sadMxN(128, 64)
+sadMxNx4D(128, 64)
+
+// 64x128
+sadMxN(64, 128)
+sadMxNx4D(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
+// 64x64
+sadMxN(64, 64)
+sadMxNxK(64, 64, 3)
+sadMxNxK(64, 64, 8)
+sadMxNx4D(64, 64)
+
+// 64x32
+sadMxN(64, 32)
+sadMxNx4D(64, 32)
+
+// 32x64
+sadMxN(32, 64)
+sadMxNx4D(32, 64)
+
+// 32x32
+sadMxN(32, 32)
+sadMxNxK(32, 32, 3)
+sadMxNxK(32, 32, 8)
+sadMxNx4D(32, 32)
+
+// 32x16
+sadMxN(32, 16)
+sadMxNx4D(32, 16)
+
+// 16x32
+sadMxN(16, 32)
+sadMxNx4D(16, 32)
+
+// 16x16
+sadMxN(16, 16)
+sadMxNxK(16, 16, 3)
+sadMxNxK(16, 16, 8)
+sadMxNx4D(16, 16)
+
+// 16x8
+sadMxN(16, 8)
+sadMxNxK(16, 8, 3)
+sadMxNxK(16, 8, 8)
+sadMxNx4D(16, 8)
+
+// 8x16
+sadMxN(8, 16)
+sadMxNxK(8, 16, 3)
+sadMxNxK(8, 16, 8)
+sadMxNx4D(8, 16)
+
+// 8x8
+sadMxN(8, 8)
+sadMxNxK(8, 8, 3)
+sadMxNxK(8, 8, 8)
+sadMxNx4D(8, 8)
+
+// 8x4
+sadMxN(8, 4)
+sadMxNxK(8, 4, 8)
+sadMxNx4D(8, 4)
+
+// 4x8
+sadMxN(4, 8)
+sadMxNxK(4, 8, 8)
+sadMxNx4D(4, 8)
+
+// 4x4
+sadMxN(4, 4)
+sadMxNxK(4, 4, 3)
+sadMxNxK(4, 4, 8)
+sadMxNx4D(4, 4)
+/* clang-format on */
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
+                                      const uint8_t *b8, int b_stride,
+                                      int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
+ const uint16_t *b, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define highbd_sadMxN(m, n) \
+ unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_highbd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint16_t comp_pred[m * n]; \
+ aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
+ }
+
+#define highbd_sadMxNxK(m, n, k) \
+ void aom_highbd_sad##m##x##n##x##k##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref_array, \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < k; ++i) { \
+ sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
+ &ref_array[i], ref_stride); \
+ } \
+ }
+
+#define highbd_sadMxNx4D(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ }
+
+/* clang-format off */
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+// 128x128
+highbd_sadMxN(128, 128)
+highbd_sadMxNxK(128, 128, 3)
+highbd_sadMxNxK(128, 128, 8)
+highbd_sadMxNx4D(128, 128)
+
+// 128x64
+highbd_sadMxN(128, 64)
+highbd_sadMxNx4D(128, 64)
+
+// 64x128
+highbd_sadMxN(64, 128)
+highbd_sadMxNx4D(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxNxK(64, 64, 3)
+highbd_sadMxNxK(64, 64, 8)
+highbd_sadMxNx4D(64, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxNxK(32, 32, 3)
+highbd_sadMxNxK(32, 32, 8)
+highbd_sadMxNx4D(32, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxNxK(16, 16, 3)
+highbd_sadMxNxK(16, 16, 8)
+highbd_sadMxNx4D(16, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxNxK(16, 8, 3)
+highbd_sadMxNxK(16, 8, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxNxK(8, 16, 3)
+highbd_sadMxNxK(8, 16, 8)
+highbd_sadMxNx4D(8, 16)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxNxK(8, 8, 3)
+highbd_sadMxNxK(8, 8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxNxK(8, 4, 8)
+highbd_sadMxNx4D(8, 4)
+
+// 4x8
+highbd_sadMxN(4, 8)
+highbd_sadMxNxK(4, 8, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 4x4
+highbd_sadMxN(4, 4)
+highbd_sadMxNxK(4, 4, 3)
+highbd_sadMxNxK(4, 4, 8)
+highbd_sadMxNx4D(4, 4)
+/* clang-format on */
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_AV1 && CONFIG_EXT_INTER
+static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride,
+                                      int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sad = (sad + 31) >> 6;
+
+ return sad;
+}
+
+#define MASKSADMxN(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \
+ n); \
+ }
+
+/* clang-format off */
+#if CONFIG_EXT_PARTITION
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+/* clang-format on */
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride,
+                                             const uint8_t *b8, int b_stride,
+                                             const uint8_t *m, int m_stride,
+                                             int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sad = (sad + 31) >> 6;
+
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, m, n); \
+ }
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_AV1 && CONFIG_EXT_INTER
+
+#if CONFIG_AV1 && CONFIG_MOTION_VAR
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (scaled by 4096 to preserve precision)
+// mask: 2D weights (scaled by 4096)
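+// Each pixel contributes |wsrc - pre * mask|, rounded back down by 4096
+// (ROUND_POWER_OF_TWO(..., 12)) to undo the scaling above.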
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+ pre += pre_stride;
+ wsrc += width;
+ mask += width;
+ }
+
+ return sad;
+}
+
+#define OBMCSADMxN(m, n) \
+ unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+#if CONFIG_EXT_PARTITION
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+/* clang-format on */
+
+#if CONFIG_HIGHBITDEPTH
+ static INLINE
+ unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+ pre += pre_stride;
+ wsrc += width;
+ mask += width;
+ }
+
+ return sad;
+}
+
+#define HIGHBD_OBMCSADMXN(m, n) \
+ unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+/* clang-format on */
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
new file mode 100644
index 000000000..8f6509383
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V128_INTRINSICS_H
+#define _V128_INTRINSICS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./v128_intrinsics_c.h"
+#include "./v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v128 v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
+SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
+SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
+SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
+ return c_v128_from_64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
+ return c_v128_from_v64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return c_v128_from_32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+ return c_v128_load_unaligned(p);
+}
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return c_v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ c_v128_store_unaligned(p, a);
+}
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ c_v128_store_aligned(p, a);
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+ return c_v128_align(a, b, c);
+}
+
+SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+
+typedef uint32_t sad128_internal;
+SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ return c_v128_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return c_v128_sad_u8_sum(s);
+}
+typedef uint32_t ssd128_internal;
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ return c_v128_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+ return c_v128_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ return c_v128_dotp_s16(a, b);
+}
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
+
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
+SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
+SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
+SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return c_v128_mullo_s16(a, b);
+}
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return c_v128_mulhi_s16(a, b);
+}
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+ return c_v128_mullo_s32(a, b);
+}
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
+
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
+
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+ return c_v128_unziplo_8(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+ return c_v128_unziphi_8(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+ return c_v128_unziplo_16(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+ return c_v128_unziphi_16(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+ return c_v128_unziplo_32(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+ return c_v128_unziphi_32(a, b);
+}
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return c_v128_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return c_v128_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return c_v128_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return c_v128_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return c_v128_pack_s32_s16(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return c_v128_pack_s16_u8(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return c_v128_pack_s16_s8(a, b);
+}
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return c_v128_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return c_v128_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return c_v128_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return c_v128_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
+ return c_v128_shuffle_8(a, pattern);
+}
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+ return c_v128_cmpgt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+ return c_v128_cmplt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return c_v128_shl_8(a, c);
+}
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return c_v128_shr_u8(a, c);
+}
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ return c_v128_shr_s8(a, c);
+}
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return c_v128_shl_16(a, c);
+}
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return c_v128_shr_u16(a, c);
+}
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return c_v128_shr_s16(a, c);
+}
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return c_v128_shl_32(a, c);
+}
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return c_v128_shr_u32(a, c);
+}
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return c_v128_shr_s32(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ return c_v128_shr_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ return c_v128_shl_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) {
+ return c_v128_shl_n_8(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
+ return c_v128_shl_n_16(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
+ return c_v128_shl_n_32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
+ return c_v128_shr_n_u8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
+ return c_v128_shr_n_u16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
+ return c_v128_shr_n_u32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
+ return c_v128_shr_n_s8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
+ return c_v128_shr_n_s16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
+ return c_v128_shr_n_s32(a, n);
+}
+
+#endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
new file mode 100644
index 000000000..0377d4ce1
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
@@ -0,0 +1,671 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V128_INTRINSICS_H
+#define _V128_INTRINSICS_H
+
+#include <arm_neon.h>
+#include "./v64_intrinsics_arm.h"
+
+typedef int64x2_t v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+ return v64_low_u32(vget_low_s64(a));
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+ return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+ return v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
+ vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
+ vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+ return c ? vreinterpretq_s64_s8(
+ vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
+ : b;
+#else
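+  // Unoptimised (or clang) builds may not fold c into an immediate, so build
+  // the result from 64-bit halves with v64_align() instead of using vextq_s8.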
+ return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
+ v64_align(v128_high_v64(b), v128_low_v64(b), c))
+ : v128_from_v64(
+ v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
+ v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
+#endif
+}
+
+SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
+
+SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
+
+SIMD_INLINE v128 v128_dup_8(uint8_t x) {
+ return vreinterpretq_s64_u8(vdupq_n_u8(x));
+}
+
+SIMD_INLINE v128 v128_dup_16(uint16_t x) {
+ return vreinterpretq_s64_u16(vdupq_n_u16(x));
+}
+
+SIMD_INLINE v128 v128_dup_32(uint32_t x) {
+ return vreinterpretq_s64_u32(vdupq_n_u32(x));
+}
+
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
+ v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
+}
+
+SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
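+  // Widening pairwise adds: u8 -> u16 -> u32 -> u64, then sum the two 64-bit
+  // halves.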
+ uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
+ return vget_lane_s32(
+ vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+}
+
+SIMD_INLINE v128 v128_padd_s16(v128 a) {
+ return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
+}
+
+typedef struct { sad64_internal hi, lo; } sad128_internal;
+
+SIMD_INLINE sad128_internal v128_sad_u8_init() {
+ sad128_internal s;
+ s.hi = s.lo = vdupq_n_u16(0);
+ return s;
+}
+
+/* Implementation-dependent return value. The result must be finalised with
+   v128_sad_u8_sum() and is undefined after more than 32 v128_sad_u8() calls. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ sad128_internal r;
+ r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
+ r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
+ return r;
+}
+
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo));
+}
+
+typedef struct { ssd64_internal hi, lo; } ssd128_internal;
+
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
+ ssd128_internal s;
+ s.hi = s.lo = (ssd64_internal)(uint64_t)0;
+ return s;
+}
+
+/* Implementation-dependent return value. The result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ ssd128_internal r;
+ r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
+ r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
+ return r;
+}
+
+SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+ return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
+}
+
+SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }
+
+SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }
+
+SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }
+
+SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }
+
+SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
+ return vreinterpretq_s64_u32(
+ vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_s8(
+ vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
+ return vreinterpretq_s64_s32(
+ vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_abs_s16(v128 x) {
+ return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
+}
+
+SIMD_INLINE v128 v128_abs_s8(v128 x) {
+ return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
+}
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+ return vreinterpretq_s64_s32(
+ vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
+}
+
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return vreinterpretq_s64_s16(
+ vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
+}
+
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
+ v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
+}
+
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+ return vreinterpretq_s64_s32(
+ vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+}
+
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
+ return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
+ v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
+}
+
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+ return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
+ v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
+}
+
+SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_s8(
+ vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_s8(
+ vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
+ uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[0]);
+}
+
+SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
+ uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[1]);
+}
+
+SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
+ uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
+}
+
+SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
+ int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
+ return vreinterpretq_s64_s16(r.val[0]);
+}
+
+SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
+ int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
+ return vreinterpretq_s64_s16(r.val[1]);
+}
+
+SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
+ uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
+ return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
+}
+
+SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
+ int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
+ return vreinterpretq_s64_s32(r.val[0]);
+}
+
+SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
+ int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
+ return vreinterpretq_s64_s32(r.val[1]);
+}
+
+SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
+ uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
+ return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
+}
+
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
+ return v128_from_v64(vget_low_u64((uint64x2_t)a),
+ vget_low_u64((uint64x2_t)b));
+}
+
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
+ return v128_from_v64(vget_high_u64((uint64x2_t)a),
+ vget_high_u64((uint64x2_t)b));
+}
+
+SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
+ uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[0]);
+}
+
+SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
+ uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[1]);
+}
+
+SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
+ return vreinterpretq_s64_u16(r.val[0]);
+}
+
+SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
+ return vreinterpretq_s64_u16(r.val[1]);
+}
+
+SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
+ uint32x4x2_t r =
+ vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
+ return vreinterpretq_s64_u32(r.val[0]);
+}
+
+SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
+ uint32x4x2_t r =
+ vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
+ return vreinterpretq_s64_u32(r.val[1]);
+}
+
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
+ return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
+ return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return v128_from_v64(
+ vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
+ vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
+}
+
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return v128_from_v64(
+ vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
+ vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
+}
+
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return v128_from_v64(
+ vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
+ vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
+}
+
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
+ return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
+ return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return vreinterpretq_s64_u32(
+ vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return vreinterpretq_s64_s32(
+ vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return vreinterpretq_s64_u32(
+ vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return vreinterpretq_s64_s32(
+ vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
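+  // vtbl2_u8 produces only 8 output bytes per lookup, so the low and high
+  // halves of the pattern are looked up separately and recombined.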
+ return v128_from_64(
+ (uint64_t)vreinterpret_s64_u8(
+ vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
+ vget_high_u8(vreinterpretq_u8_s64(x)) } },
+ vreinterpret_u8_s64(vget_high_s64(pattern)))),
+ (uint64_t)vreinterpret_s64_u8(
+ vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
+ vget_high_u8(vreinterpretq_u8_s64(x)) } },
+ vreinterpret_u8_s64(vget_low_s64(pattern)))));
+}
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
+ vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
+}
+
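+// NEON has no variable right-shift instruction, so the right shifts below
+// shift left by a negative amount; counts beyond the lane width return
+// all-zeros (logical) or all-ones (arithmetic).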
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
+ vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8(
+ vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
+}
+
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return (c > 15) ? v128_zero()
+ : vreinterpretq_s64_u16(
+ vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return (c > 15) ? v128_zero()
+ : vreinterpretq_s64_u16(
+ vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return (c > 15) ? v128_ones()
+ : vreinterpretq_s64_s16(
+ vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
+}
+
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return (c > 31) ? v128_zero()
+ : vreinterpretq_s64_u32(
+ vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return (c > 31) ? v128_zero()
+ : vreinterpretq_s64_u32(
+ vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return (c > 31) ? v128_ones()
+ : vreinterpretq_s64_s32(
+ vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
+}
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ return n < 8
+ ? v128_from_64(
+ (uint64_t)vorr_u64(
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ n * 8),
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ (8 - n) * 8)),
+ (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ n * 8))
+ : (n == 8 ? v128_from_64(
+ (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
+ : v128_from_64((uint64_t)vshl_n_u64(
+ vreinterpret_u64_s64(vget_low_s64(a)),
+ (n - 8) * 8),
+ 0));
+}
+
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ return n < 8
+ ? v128_from_64(
+ vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
+ vorr_u64(
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ (8 - n) * 8)))
+ : (n == 8
+ ? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a)))
+ : v128_from_64(
+ 0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ (n - 8) * 8)));
+}
+
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
+ return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
+ return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
+ return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
+}
+
+#else
+
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ if (n < 8)
+ return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
+ v64_shr_n_byte(v128_low_v64(a), 8 - n)),
+ v64_shl_n_byte(v128_low_v64(a), n));
+ else
+ return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
+}
+
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ if (n < 8)
+ return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
+ v64_or(v64_shr_n_byte(v128_low_v64(a), n),
+ v64_shl_n_byte(v128_high_v64(a), 8 - n)));
+ else
+ return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
+}
+
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
+ return v128_shl_8(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
+ return v128_shr_u8(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
+ return v128_shr_s8(a, c);
+}
+
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
+ return v128_shl_16(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
+ return v128_shr_u16(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
+ return v128_shr_s16(a, c);
+}
+
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
+ return v128_shl_32(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
+ return v128_shr_u32(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
+ return v128_shr_s32(a, c);
+}
+
+#endif
+
+#endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
new file mode 100644
index 000000000..32e7c32de
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
@@ -0,0 +1,707 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V128_INTRINSICS_C_H
+#define _V128_INTRINSICS_C_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./v64_intrinsics_c.h"
+#include "./aom_config.h"
+
+typedef union {
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+ uint64_t u64[2];
+ int8_t s8[16];
+ int16_t s16[8];
+ int32_t s32[4];
+ int64_t s64[2];
+ c_v64 v64[2];
+} c_v128;
+
+SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
+
+SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
+ c_v128 t;
+ t.u64[1] = hi;
+ t.u64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
+ c_v128 t;
+ t.v64[1] = hi;
+ t.v64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
+ uint32_t d) {
+ c_v128 t;
+ t.u32[3] = a;
+ t.u32[2] = b;
+ t.u32[1] = c;
+ t.u32[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
+ c_v128 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 16; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
+ abort();
+ }
+ return c_v128_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 16; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
+ abort();
+ }
+ c_v128_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v128 c_v128_zero() {
+ c_v128 t;
+ t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
+ return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
+ c_v64_dotp_s16(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
+ return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
+}
+
+typedef uint32_t c_sad128_internal;
+
+SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
+
+/* Implementation-dependent return value. The result must be finalised with
+   v128_sad_u8_sum() and is undefined after more than 32 v128_sad_u8() calls. */
+SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
+ c_v128 b) {
+ int c;
+ for (c = 0; c < 16; c++)
+ s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
+
+typedef uint32_t c_ssd128_internal;
+
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
+
+/* Implementation-dependent return value. The result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
+ c_v128 b) {
+ int c;
+ for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
+
+SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
+ c_v64_or(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
+ c_v64_xor(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
+ c_v64_and(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
+ c_v64_andn(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
+ c_v64_add_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
+ c_v64_add_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
+ c_v64_sadd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
+ c_v64_add_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
+ c_v128 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
+ c_v64_sub_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
+ c_v64_ssub_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
+ c_v64_ssub_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
+ c_v64_sub_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
+ c_v64_ssub_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
+ c_v64_ssub_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
+ c_v64_sub_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
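+  // Widening 16x16 -> 32-bit multiply: interleave the low and high halves of
+  // each product from the mullo/mulhi results.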
+ c_v64 lo_bits = c_v64_mullo_s16(a, b);
+ c_v64 hi_bits = c_v64_mulhi_s16(a, b);
+ return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
+ c_v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
+ c_v64_mullo_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
+ c_v64_mulhi_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
+ c_v64_mullo_s32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
+ c_v64_madd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
+ c_v64_madd_us8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
+ c_v64_avg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
+ c_v64_rdavg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
+ c_v64_avg_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
+ c_v64_min_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
+ c_v64_max_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
+ c_v64_min_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
+ c_v64_max_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
+ c_v64_min_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
+ c_v64_max_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
+ c_v64_ziplo_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
+ c_v64_ziplo_8(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
+ c_v64_ziplo_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
+ c_v64_ziplo_16(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
+ c_v64_ziplo_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
+ c_v64_ziplo_32(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[1], b.v64[1]);
+}
+
+SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
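+  // Deinterleave helper: mode 0 gathers the even-indexed bytes of the pair,
+  // mode 1 the odd-indexed bytes; callers pick the variant that matches the
+  // configured endianness.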
+ c_v128 t;
+ if (mode) {
+ t.u8[15] = b.u8[15];
+ t.u8[14] = b.u8[13];
+ t.u8[13] = b.u8[11];
+ t.u8[12] = b.u8[9];
+ t.u8[11] = b.u8[7];
+ t.u8[10] = b.u8[5];
+ t.u8[9] = b.u8[3];
+ t.u8[8] = b.u8[1];
+ t.u8[7] = a.u8[15];
+ t.u8[6] = a.u8[13];
+ t.u8[5] = a.u8[11];
+ t.u8[4] = a.u8[9];
+ t.u8[3] = a.u8[7];
+ t.u8[2] = a.u8[5];
+ t.u8[1] = a.u8[3];
+ t.u8[0] = a.u8[1];
+ } else {
+ t.u8[15] = a.u8[14];
+ t.u8[14] = a.u8[12];
+ t.u8[13] = a.u8[10];
+ t.u8[12] = a.u8[8];
+ t.u8[11] = a.u8[6];
+ t.u8[10] = a.u8[4];
+ t.u8[9] = a.u8[2];
+ t.u8[8] = a.u8[0];
+ t.u8[7] = b.u8[14];
+ t.u8[6] = b.u8[12];
+ t.u8[5] = b.u8[10];
+ t.u8[4] = b.u8[8];
+ t.u8[3] = b.u8[6];
+ t.u8[2] = b.u8[4];
+ t.u8[1] = b.u8[2];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
+ : _c_v128_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
+ : _c_v128_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u16[7] = b.u16[7];
+ t.u16[6] = b.u16[5];
+ t.u16[5] = b.u16[3];
+ t.u16[4] = b.u16[1];
+ t.u16[3] = a.u16[7];
+ t.u16[2] = a.u16[5];
+ t.u16[1] = a.u16[3];
+ t.u16[0] = a.u16[1];
+ } else {
+ t.u16[7] = a.u16[6];
+ t.u16[6] = a.u16[4];
+ t.u16[5] = a.u16[2];
+ t.u16[4] = a.u16[0];
+ t.u16[3] = b.u16[6];
+ t.u16[2] = b.u16[4];
+ t.u16[1] = b.u16[2];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
+ : _c_v128_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
+ : _c_v128_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u32[3] = b.u32[3];
+ t.u32[2] = b.u32[1];
+ t.u32[1] = a.u32[3];
+ t.u32[0] = a.u32[1];
+ } else {
+ t.u32[3] = a.u32[2];
+ t.u32[2] = a.u32[0];
+ t.u32[1] = b.u32[2];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
+ : _c_v128_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
+ : _c_v128_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
+ c_v64_unpacklo_u8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
+ c_v64_unpacklo_u8_s16(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
+ c_v64_unpacklo_s8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
+ c_v64_unpacklo_s8_s16(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
+ c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
+ c_v64_unpacklo_u16_s32(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
+ c_v64_unpacklo_s16_s32(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
+ c_v64_unpacklo_u16_s32(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
+ c_v64_unpacklo_s16_s32(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
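+  // Byte-wise table lookup: each output byte selects one of the 16 input
+  // bytes. Indices outside 0..15 are rejected, and on big-endian targets the
+  // index is mirrored so lane numbering stays consistent.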
+ c_v128 t;
+ int c;
+ for (c = 0; c < 16; c++) {
+ if (pattern.u8[c] & ~15) {
+ fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c],
+ c);
+ abort();
+ }
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
+ : pattern.u8[c] & 15];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
+ c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
+ c_v64_cmplt_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
+ c_v64_cmpeq_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
+ c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
+ c_v64_cmplt_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
+ c_v64_cmpeq_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) {
+ if (n < 8)
+ return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
+ c_v64_shr_n_byte(a.v64[0], 8 - n)),
+ c_v64_shl_n_byte(a.v64[0], n));
+ else
+ return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) {
+ if (n < 8)
+ return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
+ c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
+ c_v64_shl_n_byte(a.v64[1], 8 - n)));
+ else
+ return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
+}
+
+SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) {
+ if (SIMD_CHECK && c > 15) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
+ : b;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
+ c_v64_shr_u16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
+ c_v64_shr_s16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
+ c_v64_shr_u32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
+ c_v64_shr_s32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, unsigned int n) {
+ return c_v128_shl_8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, unsigned int n) {
+ return c_v128_shl_16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, unsigned int n) {
+ return c_v128_shl_32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, unsigned int n) {
+ return c_v128_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, unsigned int n) {
+ return c_v128_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, unsigned int n) {
+ return c_v128_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, unsigned int n) {
+ return c_v128_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, unsigned int n) {
+ return c_v128_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, unsigned int n) {
+ return c_v128_shr_s32(a, n);
+}
+
+#endif /* _V128_INTRINSICS_C_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
new file mode 100644
index 000000000..cca1788d5
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V128_INTRINSICS_H
+#define _V128_INTRINSICS_H
+
+#include "./v64_intrinsics_x86.h"
+
+typedef __m128i v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) {
+ return _mm_unpacklo_epi64(a, v64_zero());
+}
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+ return v128_from_v64(v64_from_64(a), v64_from_64(b));
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return _mm_set_epi32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return _mm_load_si128((__m128i *)p);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+#if defined(__SSSE3__)
+ return (__m128i)_mm_lddqu_si128((__m128i *)p);
+#else
+ return _mm_loadu_si128((__m128i *)p);
+#endif
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ _mm_store_si128((__m128i *)p, a);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ _mm_storeu_si128((__m128i *)p, a);
+}
+
+// The following function requires that its byte count be an immediate
+// (compile-time constant). Some compilers will check this during
+// optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#if defined(__SSSE3__)
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+ return c ? _mm_alignr_epi8(a, b, c) : b;
+}
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#else
+#if defined(__SSSE3__)
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b))
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#endif
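+
+// Usage sketch (illustrative only): because the byte count becomes an
+// immediate, the last argument should be a literal constant, e.g.
+//   v128 merged = v128_align(hi, lo, 4);
+// which yields the 16 bytes starting 4 bytes into the (lo, hi) pair. A
+// run-time shift amount may fail to compile with some compilers/flags.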
+
+SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }
+
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
+
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
+
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
+
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
+
+SIMD_INLINE v128 v128_padd_s16(v128 a) {
+ return _mm_madd_epi16(a, _mm_set1_epi16(1));
+}
+
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
+
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
+
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
+
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
+
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
+
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
+
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
+
+SIMD_INLINE v128 v128_abs_s16(v128 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi16(a);
+#else
+ return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
+#endif
+}
+
+SIMD_INLINE v128 v128_abs_s8(v128 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
+ return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
+#endif
+}
+
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
+ return _mm_unpacklo_epi8(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
+ return _mm_unpackhi_epi8(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
+ return _mm_unpacklo_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
+ return _mm_unpackhi_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
+ return _mm_unpacklo_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
+ return _mm_unpackhi_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
+ return _mm_unpackhi_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
+
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
+
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
+
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+ return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
+}
+
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+ v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
+#endif
+ return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
+ _mm_shuffle_epi8(a, order));
+#else
+ return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+ return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
+}
+
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+ v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
+#endif
+ return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
+ _mm_shuffle_epi8(a, order));
+#else
+ return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+ return _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
+}
+
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+ return _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
+}
+
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return _mm_unpackhi_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return _mm_packs_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return _mm_packus_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return _mm_packs_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return _mm_unpackhi_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(x, pattern);
+#else
+ v128 output;
+ unsigned char *input = (unsigned char *)&x;
+ unsigned char *index = (unsigned char *)&pattern;
+ char *selected = (char *)&output;
+ int counter;
+
+ for (counter = 0; counter < 16; counter++) {
+ selected[counter] = input[index[counter] & 15];
+ }
+
+ return output;
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ v128 r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
+ _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
+ return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
+ v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
+ return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
+}
+
+typedef v128 sad128_internal;
+
+SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+   v128_sad_u8_sum().
+ The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
+}
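+
+// Example (illustrative, not part of the API): a whole-block SAD built from
+// the primitives above. Names other than the v128_* calls are hypothetical,
+// and 'rows' must respect the 32-call limit noted above.
+SIMD_INLINE uint32_t v128_example_sad(const uint8_t *src, const uint8_t *ref,
+                                      int rows, int stride) {
+  sad128_internal s = v128_sad_u8_init();
+  int r;
+  for (r = 0; r < rows; r++)
+    s = v128_sad_u8(s, v128_load_unaligned(src + r * stride),
+                    v128_load_unaligned(ref + r * stride));
+  return v128_sad_u8_sum(s);
+}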
+
+typedef v128 ssd128_internal;
+
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_unpacklo_epi8(b, _mm_setzero_si128()));
+ v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
+ _mm_unpackhi_epi8(b, _mm_setzero_si128()));
+ v128 rl = _mm_madd_epi16(l, l);
+ v128 rh = _mm_madd_epi16(h, h);
+ v128 c = _mm_cvtsi32_si128(32);
+ rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8));
+ rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4));
+ rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8));
+ rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4));
+ return _mm_add_epi64(
+ s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c));
+}
+
+SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+ return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
+}
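+
+// The SSD accumulator follows the same init/accumulate/sum pattern as the
+// SAD accumulator above, e.g. (illustrative):
+//   ssd128_internal s = v128_ssd_u8_init();
+//   s = v128_ssd_u8(s, a, b);
+//   uint32_t ssd = v128_ssd_u8_sum(s);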
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+ v64 lo_bits = v64_mullo_s16(a, b);
+ v64 hi_bits = v64_mulhi_s16(a, b);
+ return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
+ v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return _mm_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return _mm_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
+ _mm_shuffle_epi32(
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
+#endif
+}
+
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+ return _mm_maddubs_epi16(a, b);
+#else
+ return _mm_packs_epi32(
+ _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
+ _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
+#endif
+}
+
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
+
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
+ return _mm_sub_epi8(_mm_avg_epu8(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
+}
+
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
+
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
+
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
+
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi8(a, b);
+#else
+ v128 mask = _mm_cmplt_epi8(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi8(a, b);
+#else
+ v128 mask = _mm_cmplt_epi8(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
+
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+ return _mm_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+ return _mm_cmplt_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ __m128i x = _mm_cvtsi32_si128(c + 8);
+ return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
+ _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
+#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
+#define v128_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+ _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
+#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
+#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
+#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
+#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
+#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
+#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
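+
+// For example, v128_shr_n_u16(x, 2) shifts every 16-bit lane of x right by a
+// constant two bits; a shift amount known only at run time must go through
+// v128_shr_u16() instead.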
+
+#endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 000000000..1896374ee
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./v256_intrinsics_c.h"
+#include "./v128_intrinsics.h"
+#include "./v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
+ return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+
+typedef uint32_t sad256_internal;
+SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return c_v256_sad_u8_sum(s);
+}
+typedef uint32_t ssd256_internal;
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ return c_v256_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return c_v256_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return c_v256_dotp_s16(a, b);
+}
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return c_v256_mullo_s16(a, b);
+}
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return c_v256_mulhi_s16(a, b);
+}
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return c_v256_mullo_s32(a, b);
+}
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return c_v256_ziplo_128(a, b);
+}
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return c_v256_ziphi_128(a, b);
+}
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return c_v256_unziplo_8(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return c_v256_unziphi_8(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return c_v256_unziplo_16(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return c_v256_unziphi_16(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return c_v256_unziplo_32(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return c_v256_unziphi_32(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return c_v256_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return c_v256_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return c_v256_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return c_v256_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return c_v256_pack_s32_s16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return c_v256_pack_s16_u8(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return c_v256_pack_s16_s8(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return c_v256_unpack_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return c_v256_unpack_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return c_v256_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return c_v256_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return c_v256_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return c_v256_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ return c_v256_shuffle_8(a, pattern);
+}
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return c_v256_pshuffle_8(a, pattern);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return c_v256_cmpgt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return c_v256_cmplt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return c_v256_shl_8(a, c);
+}
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return c_v256_shr_u8(a, c);
+}
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ return c_v256_shr_s8(a, c);
+}
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return c_v256_shl_16(a, c);
+}
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return c_v256_shr_u16(a, c);
+}
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return c_v256_shr_s16(a, c);
+}
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return c_v256_shl_32(a, c);
+}
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return c_v256_shr_u32(a, c);
+}
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return c_v256_shr_s32(a, c);
+}
+
+SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
+ return c_v256_shr_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) {
+ return c_v256_shl_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) {
+ return c_v256_shl_n_8(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
+ return c_v256_shl_n_16(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
+ return c_v256_shl_n_32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
+ return c_v256_shr_n_u8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
+ return c_v256_shr_n_u16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
+ return c_v256_shr_n_u32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
+ return c_v256_shr_n_s8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
+ return c_v256_shr_n_s16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
+ return c_v256_shr_n_s32(a, n);
+}
+
+#endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
new file mode 100644
index 000000000..ba4ed719d
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#include "./v256_intrinsics_v128.h"
+
+#endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
new file mode 100644
index 000000000..f96ca7fa6
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
@@ -0,0 +1,724 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_C_H
+#define _V256_INTRINSICS_C_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./v128_intrinsics_c.h"
+#include "./aom_config.h"
+
+typedef union {
+ uint8_t u8[32];
+ uint16_t u16[16];
+ uint32_t u32[8];
+ uint64_t u64[4];
+ int8_t s8[32];
+ int16_t s16[16];
+ int32_t s32[8];
+ int64_t s64[4];
+ c_v64 v64[4];
+ c_v128 v128[2];
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+ c_v256 t;
+ t.v128[1] = hi;
+ t.v128[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+ uint64_t d) {
+ c_v256 t;
+ t.u64[3] = a;
+ t.u64[2] = b;
+ t.u64[1] = c;
+ t.u64[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+ c_v256 t;
+ t.u64[3] = a.u64;
+ t.u64[2] = b.u64;
+ t.u64[1] = c.u64;
+ t.u64[0] = d.u64;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+ c_v256 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 32; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+ abort();
+ }
+ return c_v256_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 32; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+ abort();
+ }
+ c_v256_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v256 c_v256_zero() {
+ c_v256 t;
+ t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+ return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
+ c_v128_dotp_s16(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+ return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
+}
+
+typedef uint32_t c_sad256_internal;
+
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++)
+ s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
+
+typedef uint32_t c_ssd256_internal;
+
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
+
+SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
+ c_v128_or(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
+ c_v128_xor(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
+ c_v128_and(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
+ c_v128_andn(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
+ c_v128_add_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
+ c_v128_add_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
+ c_v128_sadd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
+ c_v128_add_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
+ c_v256 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
+ t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
+ t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
+ t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
+ c_v128_sub_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
+ c_v128_ssub_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
+ c_v128_ssub_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
+ c_v128_sub_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
+ c_v128_ssub_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
+ c_v128_ssub_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
+ c_v128_sub_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
+ c_v128 lo_bits = c_v128_mullo_s16(a, b);
+ c_v128 hi_bits = c_v128_mulhi_s16(a, b);
+ return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
+ c_v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
+ c_v128_mullo_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
+ c_v128_mulhi_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
+ c_v128_mullo_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
+ c_v128_madd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
+ c_v128_madd_us8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
+ c_v128_avg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
+ c_v128_avg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
+ c_v128_min_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
+ c_v128_max_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
+ c_v128_min_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
+ c_v128_max_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
+ c_v128_min_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
+ c_v128_max_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
+ c_v128_ziplo_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
+ c_v128_ziplo_8(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
+ c_v128_ziplo_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
+ c_v128_ziplo_16(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
+ c_v128_ziplo_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
+ c_v128_ziplo_32(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
+ c_v128_ziplo_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
+ c_v128_ziplo_64(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[1], b.v128[1]);
+}
+
+SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = a.u8[i * 2 + 1];
+ t.u8[i + 16] = b.u8[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = b.u8[i * 2];
+ t.u8[i + 16] = a.u8[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
+ : _c_v256_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
+ : _c_v256_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = a.u16[i * 2 + 1];
+ t.u16[i + 8] = b.u16[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = b.u16[i * 2];
+ t.u16[i + 8] = a.u16[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
+ : _c_v256_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
+ : _c_v256_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ if (mode) {
+ t.u32[7] = b.u32[7];
+ t.u32[6] = b.u32[5];
+ t.u32[5] = b.u32[3];
+ t.u32[4] = b.u32[1];
+ t.u32[3] = a.u32[7];
+ t.u32[2] = a.u32[5];
+ t.u32[1] = a.u32[3];
+ t.u32[0] = a.u32[1];
+ } else {
+ t.u32[7] = a.u32[6];
+ t.u32[6] = a.u32[4];
+ t.u32[5] = a.u32[2];
+ t.u32[4] = a.u32[0];
+ t.u32[3] = b.u32[6];
+ t.u32[2] = b.u32[4];
+ t.u32[1] = b.u32[2];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
+ : _c_v256_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
+ : _c_v256_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+ c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+ c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
+ c_v128_unpacklo_s8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
+ c_v128_unpacklo_s8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+ c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+ c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+ c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+ c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+ c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+ c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+ c_v256 t;
+ int c;
+ for (c = 0; c < 32; c++) {
+ if (pattern.u8[c] & ~31) {
+ fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
+ c);
+ abort();
+ }
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+ : pattern.u8[c] & 31];
+ }
+ return t;
+}
+
+// Pairwise / dual-lane shuffle: shuffle two 128-bit lanes.
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
+ return c_v256_from_v128(
+ c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
+ c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
+}
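+
+// For example, on a little-endian build a pattern of all zero bytes makes
+// c_v256_pshuffle_8() broadcast byte 0 within the low half and byte 16 within
+// the high half, whereas c_v256_shuffle_8() above would broadcast byte 0 of
+// the whole 256-bit vector into every lane.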
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
+ if (n < 16)
+ return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
+ c_v128_shr_n_byte(a.v128[0], 16 - n)),
+ c_v128_shl_n_byte(a.v128[0], n));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
+ c_v128_zero());
+ else
+ return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
+ if (n < 16)
+ return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
+ c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
+ c_v128_shl_n_byte(a.v128[1], 16 - n)));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_zero(),
+ c_v128_shr_n_byte(a.v128[1], n - 16));
+ else
+ return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
+}
+
+SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
+ if (SIMD_CHECK && c > 31) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
+ : b;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+ c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+ c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+ c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+ c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+ c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+ c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+ c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+ c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+ c_v128_shr_s32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
+ return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
+ return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
+ return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
+ return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
+ return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
+ return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
+ return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
+ return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
+ return c_v256_shr_s32(a, n);
+}
+
+#endif /* _V256_INTRINSICS_C_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
new file mode 100644
index 000000000..a4b334ea6
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_V128_H
+#define _V256_INTRINSICS_V128_H
+
+#if HAVE_NEON
+#include "./v128_intrinsics_arm.h"
+#elif HAVE_SSE2
+#include "./v128_intrinsics_x86.h"
+#else
+#include "./v128_intrinsics.h"
+#endif
+
+typedef struct { v128 lo, hi; } v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }
+
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }
+
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ v256 t;
+ t.hi = hi;
+ t.lo = lo;
+ return t;
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
+ v128_load_unaligned(p));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
+ v128_load_aligned(p));
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ v128_store_unaligned(p, a.lo);
+ v128_store_unaligned((uint8_t *)p + 16, a.hi);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ v128_store_aligned(p, a.lo);
+ v128_store_aligned((uint8_t *)p + 16, a.hi);
+}
+
+SIMD_INLINE v256 v256_zero() {
+ return v256_from_v128(v128_zero(), v128_zero());
+}
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) {
+ v128 t = v128_dup_8(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) {
+ v128 t = v128_dup_16(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) {
+ v128 t = v128_dup_32(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
+}
+
+typedef struct {
+ sad128_internal hi;
+ sad128_internal lo;
+} sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+ sad256_internal t;
+ t.hi = v128_sad_u8_init();
+ t.lo = v128_sad_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ sad256_internal t;
+ t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
+ t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
+}
+
+typedef struct {
+ ssd128_internal hi;
+ ssd128_internal lo;
+} ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+ ssd256_internal t;
+ t.hi = v128_ssd_u8_init();
+ t.lo = v128_ssd_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ ssd256_internal t;
+ t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
+ t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) {
+ return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
+ return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) {
+ return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
+ return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
+ return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
+ return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
+ return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) {
+ return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) {
+ return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo));
+}
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return v256_from_v128(a.lo, b.lo);
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return v256_from_v128(a.hi, b.hi);
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
+ v128_unziplo_16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
+ v128_unziphi_16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
+ v128_unziplo_32(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
+ v128_unziphi_32(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
+}
+
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.lo), v128_unpacklo_s8_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.hi), v128_unpacklo_s8_s16(a.hi));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
+ v128_pack_s32_s16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
+ v128_pack_s16_u8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
+ v128_pack_s16_s8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
+ v128_unpacklo_u16_s32(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
+ v128_unpacklo_s16_s32(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
+ v128_unpacklo_u16_s32(a.hi));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
+ v128_unpacklo_s16_s32(a.hi));
+}
+
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ v128 c16 = v128_dup_8(16);
+ v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
+ v128 masklo = v128_cmplt_s8(pattern.lo, c16);
+ return v256_from_v128(
+ v128_or(
+ v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
+ v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
+ v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
+ v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
+ masklo)));
+}
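+
+/* Editorial note (descriptive, not part of the upstream header): in
+   v256_shuffle_8 above, pattern bytes 0..15 select from the low half of a
+   and 16..31 from the high half; the cmplt mask blends the two per-half
+   shuffles so the whole thing behaves as one 32-byte table lookup. */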
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return v256_from_v128(
+ v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
+ v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n), \
+ v128_shr_n_byte(a.lo, 16 - (n))), \
+ v128_shl_n_byte(a.lo, (n))) \
+ : v256_from_v128((n) > 16 ? v128_shl_n_byte(a.lo, (n)-16) : a.lo, \
+ v128_zero()))
+
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n), \
+ v128_or(v128_shr_n_byte(a.lo, n), \
+ v128_shl_n_byte(a.hi, 16 - (n)))) \
+ : v256_from_v128(v128_zero(), \
+ (n) > 16 ? v128_shr_n_byte(a.hi, (n)-16) : a.hi))
+
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+#define v256_shl_n_8(a, n) \
+ v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
+#define v256_shl_n_16(a, n) \
+ v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
+#define v256_shl_n_32(a, n) \
+ v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
+#define v256_shr_n_u8(a, n) \
+ v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
+#define v256_shr_n_u16(a, n) \
+ v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
+#define v256_shr_n_u32(a, n) \
+ v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
+#define v256_shr_n_s8(a, n) \
+ v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
+#define v256_shr_n_s16(a, n) \
+ v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
+#define v256_shr_n_s32(a, n) \
+ v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))
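+
+/* Editorial note: an illustrative sketch, not part of the upstream header.
+   Because these macros may expand to intrinsics that need compile-time
+   immediates, the shift amount must be a literal at the call site, e.g.
+
+     v256 x = v256_shl_n_16(a, 2);    // OK: 2 is an immediate
+     v256 y = v256_shr_n_byte(a, 4);  // OK: 4 is an immediate
+
+   A runtime shift count should use the non-immediate forms above, such as
+   v256_shl_16(a, c). */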
+
+#endif /* _V256_INTRINSICS_V128_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 000000000..b82daab68
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#if !defined(__AVX2__)
+
+#include "./v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s mangling prior to
+// version 5, but adding -fabi-version=0 fixes this.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
+ defined(__AVX2__) && defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+#include "./v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
+}
+
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+ return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
+}
+
+SIMD_INLINE v128 v256_low_v128(v256 a) {
+ return _mm256_extracti128_si256(a, 0);
+}
+
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+ return _mm256_extracti128_si256(a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+ // gcc seems to be missing _mm256_set_m128i()
+ return _mm256_insertf128_si256(
+ _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
+}
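+
+/* Editorial note (illustrative, not part of the upstream header): newer
+   compilers appear to provide the dedicated intrinsic, so the above could
+   also be written as
+
+     return _mm256_set_m128i(a, b);  // a = high 128 bits, b = low 128 bits
+
+   the two insertf128 steps are kept as a workaround for older gcc. */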
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return _mm256_load_si256((const __m256i *)p);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return _mm256_loadu_si256((const __m256i *)p);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ _mm256_store_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ _mm256_storeu_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return _mm256_adds_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return _mm256_subs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+ return _mm256_subs_epu16(a, b);
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
+
+// AVX doesn't have direct intrinsics to zip/unzip 8-, 16- or 32-bit lanes
+// of the lower or upper half of a 256-bit vector, because the unpack/pack
+// intrinsics operate on the 256-bit input vector as two independent 128-bit
+// vectors.
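+/* Editorial note (illustrative, not part of the upstream header): for
+   example, v256_ziplo_8(a, b) interleaves the low 128-bit halves of a and b
+   byte by byte, giving { b0, a0, b1, a1, ..., b15, a15 }, whereas
+   _mm256_unpacklo_epi8 would interleave within each 128-bit lane
+   separately. */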
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
+ v128_unpacklo_u8_s16(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
+ v128_unpacklo_u8_s16(v256_high_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(v256_low_v128(a)),
+ v128_unpacklo_s8_s16(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(v256_high_v128(a)),
+ v128_unpacklo_s8_s16(v256_high_v128(a)));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
+ v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
+ v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
+ v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
+ v128_unpacklo_u16_s32(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
+ v128_unpacklo_s16_s32(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
+ v128_unpacklo_u16_s32(v256_high_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
+ v128_unpacklo_s16_s32(v256_high_v128(a)));
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ v128 c16 = v128_dup_8(16);
+ v128 hi = v256_high_v128(pattern);
+ v128 lo = v256_low_v128(pattern);
+ v128 maskhi = v128_cmplt_s8(hi, c16);
+ v128 masklo = v128_cmplt_s8(lo, c16);
+ return v256_from_v128(
+ v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
+ v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
+ maskhi)),
+ v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
+ v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
+ masklo)));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return _mm256_shuffle_epi8(a, pattern);
+}
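+
+/* Editorial note (descriptive, not part of the upstream header): unlike
+   v256_shuffle_8 above, this variant maps directly to _mm256_shuffle_epi8,
+   so each control byte selects a byte from its own 128-bit lane only rather
+   than from the full 32-byte vector. */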
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ v256 r = _mm256_madd_epi16(a, b);
+#if defined(__x86_64__)
+ v128 t;
+ r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+ _mm256_cvtepi32_epi64(v256_low_v128(r)));
+ t = v256_low_v128(_mm256_add_epi64(
+ r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+ return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+ v128 l = v256_low_v128(r);
+ v128 h = v256_high_v128(r);
+ return (int64_t)_mm_cvtsi128_si32(l) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+ (int64_t)_mm_cvtsi128_si32(h) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
+ v128 lo = v256_low_v128(t);
+ v128 hi = v256_high_v128(t);
+ lo = v128_add_32(lo, hi);
+ return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
+}
+
+typedef v256 sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value. Result must be finalised with
+   v256_sad_u8_sum().
+ The result for more than 32 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
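+
+/* Editorial note: an illustrative usage sketch, not part of the upstream
+   header. The 32-call limit noted above exists because some back ends
+   accumulate partial sums in 16-bit lanes; src, ref and stride are assumed
+   caller-side names:
+
+     sad256_internal acc = v256_sad_u8_init();
+     for (int i = 0; i < 32; i++)
+       acc = v256_sad_u8(acc, v256_load_unaligned(src + i * stride),
+                         v256_load_unaligned(ref + i * stride));
+     uint32_t sad = v256_sad_u8_sum(acc);
+*/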
+
+typedef v256 ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
+ v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
+ v256 rl = _mm256_madd_epi16(l, l);
+ v256 rh = _mm256_madd_epi16(h, h);
+ v128 c = _mm_cvtsi32_si128(32);
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
+ return _mm256_add_epi64(
+ s,
+ _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
+}
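+
+/* Editorial note (descriptive, not part of the upstream header): the routine
+   above widens a and b to 16 bits, forms differences, squares and pairwise
+   sums them with _mm256_madd_epi16, folds each 128-bit lane down to a 32-bit
+   partial in its low element, and finally uses the shift-left/shift-right by
+   32 pair to zero-extend those partials to 64 bits before accumulating with
+   _mm256_add_epi64. */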
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return _mm256_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return _mm256_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return _mm256_mullo_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return _mm256_madd_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return _mm256_maddubs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return _mm256_sub_epi8(
+ _mm256_avg_epu8(a, b),
+ _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return _mm256_cmpgt_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return _mm256_cmpeq_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return _mm256_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return _mm256_cmpeq_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
+ _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
+ _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ __m128i x = _mm_cvtsi32_si128(c + 8);
+ return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
+ _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+// _mm256_slli_si256 works on 128 bit lanes and can't be used
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 \
+ ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n), \
+ v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \
+ v128_shl_n_byte(v256_low_v128(a), n)) \
+ : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16), \
+ v128_zero()))
+
+// _mm256_srli_si256 works on 128 bit lanes and can't be used
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 \
+ ? _mm256_alignr_epi8( \
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
+ : ((n) > 16 \
+ ? _mm256_srli_si256( \
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \
+ (n)-16) \
+ : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1))))
+
+// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
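+
+/* Editorial note (illustrative, not part of the upstream header): the
+   _mm256_permute2x128_si256 step above is what carries bytes across the
+   128-bit lane boundary; a bare _mm256_srli_si256(a, 1), for instance,
+   shifts each 128-bit lane independently, so input byte 16 would never
+   reach result byte 15. */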
+
+#define v256_shl_n_8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
+ _mm256_slli_epi16(a, c))
+#define v256_shr_n_u8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
+#define v256_shr_n_s8(a, c) \
+ _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
+ _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
+#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
+#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
+#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
+#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
+#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
+#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#endif
+
+#endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
new file mode 100644
index 000000000..ee2b683a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V64_INTRINSICS_H
+#define _V64_INTRINSICS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./v64_intrinsics_c.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v64 v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
+SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
+SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return c_v64_from_32(x, y);
+}
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return c_v64_from_16(a, b, c, d);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return c_u32_load_unaligned(p);
+}
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return c_u32_load_aligned(p);
+}
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+ c_u32_store_unaligned(p, a);
+}
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ c_u32_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return c_v64_load_unaligned(p);
+}
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return c_v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ c_v64_store_unaligned(p, a);
+}
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ c_v64_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+  return c_v64_align(a, b, c);
+}
+
+SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
+SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
+SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ return c_v64_pack_s32_s16(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ return c_v64_pack_s16_u8(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ return c_v64_pack_s16_s8(a, b);
+}
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return c_v64_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return c_v64_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return c_v64_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return c_v64_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
+ return c_v64_shuffle_8(a, pattern);
+}
+
+typedef uint32_t sad64_internal;
+SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return c_v64_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+ return c_v64_sad_u8_sum(s);
+}
+typedef uint32_t ssd64_internal;
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+ return c_v64_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+ return c_v64_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }
+
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }
+
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
+
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
+ return c_v64_shr_u16(a, n);
+}
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
+ return c_v64_shr_s16(a, n);
+}
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
+ return c_v64_shr_u32(a, n);
+}
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
+ return c_v64_shr_s32(a, n);
+}
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) {
+ return c_v64_shr_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) {
+ return c_v64_shl_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
+ return c_v64_shl_n_8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
+ return c_v64_shr_n_u8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
+ return c_v64_shr_n_s8(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
+ return c_v64_shl_n_16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return c_v64_shr_n_u16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return c_v64_shr_n_s16(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
+ return c_v64_shl_n_32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return c_v64_shr_n_u32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return c_v64_shr_n_s32(a, c);
+}
+
+#endif /* _V64_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
new file mode 100644
index 000000000..c7574eef5
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V64_INTRINSICS_H
+#define _V64_INTRINSICS_H
+
+#include <arm_neon.h>
+#include "./v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
+
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
+#endif
+
+typedef int64x1_t v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+ return vget_lane_u32(vreinterpret_u32_s64(a), 0);
+}
+
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+ return vget_lane_u32(vreinterpret_u32_s64(a), 1);
+}
+
+SIMD_INLINE int32_t v64_low_s32(v64 a) {
+ return vget_lane_s32(vreinterpret_s32_s64(a), 0);
+}
+
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+ return vget_lane_s32(vreinterpret_s32_s64(a), 1);
+}
+
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
+ d);
+}
+
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return vcreate_s64((uint64_t)x << 32 | y);
+}
+
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
+
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
+
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
+}
+
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+#if defined(__clang__)
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+ 0);
+#elif defined(__CC_ARM)
+  *((__packed uint32_t *)p) = a;
+#elif defined(__GNUC__)
+ *((__attribute((packed)) uint32_t *)p) = a;
+#else
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+ 0);
+#endif
+}
+
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
+ vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
+ vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+// The following function requires an immediate.
+// Some compilers will check this when optimising, others won't.
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+ return c ? vreinterpret_s64_s8(
+ vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
+ : b;
+#else
+ return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
+ : b;
+#endif
+}
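+
+/* Editorial note (illustrative, not part of the upstream header): on the
+   optimised path vext_s8 needs a compile-time immediate, so the byte count
+   should be a literal at the call site, e.g. v64_align(a, b, 3); the
+   fallback branch accepts a runtime value. */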
+
+SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
+
+SIMD_INLINE v64 v64_dup_8(uint8_t x) {
+ return vreinterpret_s64_u8(vdup_n_u8(x));
+}
+
+SIMD_INLINE v64 v64_dup_16(uint16_t x) {
+ return vreinterpret_s64_u16(vdup_n_u16(x));
+}
+
+SIMD_INLINE v64 v64_dup_32(uint32_t x) {
+ return vreinterpret_s64_u32(vdup_n_u32(x));
+}
+
+SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
+ int64x2_t r = vpaddlq_s32(vpaddlq_s16(
+ vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
+ vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
+ return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+}
+
+SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
+ int64x2_t r =
+ vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+ return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+}
+
+SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
+ return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+}
+
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
+ return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+}
+
+typedef uint16x8_t sad64_internal;
+
+SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
+
+/* Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum().
+ The result for more than 32 v64_sad_u8() calls is undefined. */
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
+}
+
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+ uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
+ return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+}
+
+typedef int64x1_t ssd64_internal;
+
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
+ return (ssd64_internal)(uint64_t)0;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+ uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
+ uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
+ return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
+}
+
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+ return (uint32_t)(uint64_t)s;
+}
+
+SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
+
+SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }
+
+SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }
+
+SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }
+
+SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
+ return vreinterpret_s64_u32(
+ vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(
+ vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
+ return vreinterpret_s64_s32(
+ vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
+}
+
+SIMD_INLINE v64 v64_abs_s16(v64 x) {
+ return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
+}
+
+SIMD_INLINE v64 v64_abs_s8(v64 x) {
+ return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
+}
+
+SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
+ vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
+}
+
+SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
+ return vreinterpret_s64_s32(
+ vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
+}
+
+SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
+ int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
+ return vreinterpret_s64_s32(
+ vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
+ vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
+}
+
+SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
+ return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
+ vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
+ vreinterpret_s8_s64(y)),
+ vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
+}
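+
+/* Editorial note (descriptive, not part of the upstream header): the
+   unsigned-by-signed multiply above is emulated by biasing x with -128 so it
+   fits vmull_s8, then adding back the 128 * y term (the vshlq_n_s16 by 7),
+   since (x - 128) * y + 128 * y == x * y; the pairwise add and saturating
+   narrow reproduce the madd semantics. */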
+
+SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(
+ vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(
+ vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
+ uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[0]);
+}
+
+SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
+ uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[1]);
+}
+
+SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
+ int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
+ return vreinterpret_s64_s16(r.val[0]);
+}
+
+SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
+ int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
+ return vreinterpret_s64_s16(r.val[1]);
+}
+
+SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
+ int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
+ return vreinterpret_s64_s32(r.val[0]);
+}
+
+SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
+ int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
+ return vreinterpret_s64_s32(r.val[1]);
+}
+
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
+ return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
+ return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
+ return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
+ return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(vqmovn_s32(
+ vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
+}
+
+SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
+ vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
+}
+
+SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
+ vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
+}
+
+SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
+ uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[0]);
+}
+
+SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
+ uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[1]);
+}
+
+SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
+ uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
+ return vreinterpret_s64_u16(r.val[0]);
+}
+
+SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
+ uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
+ return vreinterpret_s64_u16(r.val[1]);
+}
+
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
+ return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
+ return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
+ return vreinterpret_s64_s32(
+ vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
+ return vreinterpret_s64_u32(
+ vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
+ return vreinterpret_s64_u8(
+ vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
+}
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
+}
+
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
+ return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
+}
+
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(
+ vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
+}
+
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
+ return vreinterpret_s64_s16(
+ vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
+}
+
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(
+ vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
+}
+
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
+ return vreinterpret_s64_s32(
+ vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
+}
+
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
+ return vshl_n_s64(a, c * 8);
+}
+
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
+ return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
+}
+
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
+ return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
+}
+
+#else
+
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
+ return v64_from_64(v64_u64(a) << c * 8);
+}
+
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
+ return v64_from_64(v64_u64(a) >> c * 8);
+}
+
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }
+
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return v64_shr_u16(a, c);
+}
+
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return v64_shr_s16(a, c);
+}
+
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return v64_shr_u32(a, c);
+}
+
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return v64_shr_s32(a, c);
+}
+
+#endif
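
To make the immediate-shift contract above concrete, here is a minimal usage sketch (illustration only, assuming this ARM header is the one selected by the build; the function name is hypothetical). Literal shift counts may use the _n_ forms, which map to the NEON immediate-shift intrinsics on GCC with optimisation enabled, while run-time counts must go through the variable-shift forms defined earlier in the header.

    static uint64_t halve_u8_lanes(uint64_t packed) {
      v64 v = v64_from_64(packed);
      /* OK: the count is a literal, so vshr_n_u8() can be emitted directly. */
      v64 halved = v64_shr_n_u8(v, 1);
      /* A run-time count would have to use v64_shr_u8(v, count) instead. */
      return v64_u64(halved);
    }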
+
+#endif /* _V64_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
new file mode 100644
index 000000000..5032238b6
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
@@ -0,0 +1,919 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V64_INTRINSICS_C_H
+#define _V64_INTRINSICS_C_H
+
+/* Note: This implements the intrinsics in plain, unoptimised C.
+ Intended for reference, porting or debugging. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./aom_config.h"
+
+typedef union {
+ uint8_t u8[8];
+ uint16_t u16[4];
+ uint32_t u32[2];
+ uint64_t u64;
+ int8_t s8[8];
+ int16_t s16[4];
+ int32_t s32[2];
+ int64_t s64;
+} c_v64;
+
+SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }
+
+SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
+ return a.u32[!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }
+
+SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
+ return a.s32[!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
+ c_v64 t;
+ t.u32[!CONFIG_BIG_ENDIAN] = x;
+ t.u32[CONFIG_BIG_ENDIAN] = y;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
+ c_v64 t;
+ t.u64 = x;
+ return t;
+}
+
+SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }
+
+SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
+ uint16_t d) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ t.u16[0] = a;
+ t.u16[1] = b;
+ t.u16[2] = c;
+ t.u16[3] = d;
+ } else {
+ t.u16[3] = a;
+ t.u16[2] = b;
+ t.u16[1] = c;
+ t.u16[0] = d;
+ }
+ return t;
+}
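
The CONFIG_BIG_ENDIAN indexing above exists so that the lane constructors produce the same 64-bit value regardless of host byte order; c_v64_from_16(a, b, c, d) always places its first argument in the most significant 16 bits. A small sanity sketch (assuming a little-endian host, i.e. CONFIG_BIG_ENDIAN == 0; the function name is illustrative):

    static int check_lane_order(void) {
      c_v64 v = c_v64_from_16(0x1111, 0x2222, 0x3333, 0x4444);
      /* u16[3] holds 0x1111 on a little-endian host, so the packed value is: */
      return c_v64_u64(v) == 0x1111222233334444ULL;
    }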
+
+SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
+ uint32_t t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 4; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 4; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 3) {
+ fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
+ abort();
+ }
+ return c_u32_load_unaligned(p);
+}
+
+SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
+ if (SIMD_CHECK && (uintptr_t)p & 3) {
+ fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
+ abort();
+ }
+ c_u32_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
+ c_v64 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 8; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 7) {
+ fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
+ abort();
+ }
+ return c_v64_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
+ uint8_t *q = (uint8_t *)p;
+ uint8_t *r = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 8; c++) q[c] = r[c];
+}
+
+SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 7) {
+ fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
+ abort();
+ }
+ c_v64_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v64 c_v64_zero() {
+ c_v64 t;
+ t.u64 = 0;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
+ c_v64 t;
+ t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
+ t.u8[7] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
+ c_v64 t;
+ t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
+ c_v64 t;
+ t.u32[0] = t.u32[1] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
+ ? 32767
+ : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
+ ? -32768
+ : (int32_t)a.s16[c] + (int32_t)b.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
+ t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++)
+ t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) {
+ int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
+ t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
+ ? -32768
+ : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
+ ? 32767
+ : (int32_t)a.s16[c] - (int32_t)b.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.u16[c] =
+ (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
+ t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u8[7] = a.u8[7];
+ t.u8[6] = b.u8[7];
+ t.u8[5] = a.u8[6];
+ t.u8[4] = b.u8[6];
+ t.u8[3] = a.u8[5];
+ t.u8[2] = b.u8[5];
+ t.u8[1] = a.u8[4];
+ t.u8[0] = b.u8[4];
+ } else {
+ t.u8[7] = a.u8[3];
+ t.u8[6] = b.u8[3];
+ t.u8[5] = a.u8[2];
+ t.u8[4] = b.u8[2];
+ t.u8[3] = a.u8[1];
+ t.u8[2] = b.u8[1];
+ t.u8[1] = a.u8[0];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u16[3] = a.u16[3];
+ t.u16[2] = b.u16[3];
+ t.u16[1] = a.u16[2];
+ t.u16[0] = b.u16[2];
+ } else {
+ t.u16[3] = a.u16[1];
+ t.u16[2] = b.u16[1];
+ t.u16[1] = a.u16[0];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u32[1] = a.u32[1];
+ t.u32[0] = b.u32[1];
+ } else {
+ t.u32[1] = a.u32[0];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u8[7] = b.u8[7];
+ t.u8[6] = b.u8[5];
+ t.u8[5] = b.u8[3];
+ t.u8[4] = b.u8[1];
+ t.u8[3] = a.u8[7];
+ t.u8[2] = a.u8[5];
+ t.u8[1] = a.u8[3];
+ t.u8[0] = a.u8[1];
+ } else {
+ t.u8[7] = a.u8[6];
+ t.u8[6] = a.u8[4];
+ t.u8[5] = a.u8[2];
+ t.u8[4] = a.u8[0];
+ t.u8[3] = b.u8[6];
+ t.u8[2] = b.u8[4];
+ t.u8[1] = b.u8[2];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u16[3] = b.u16[3];
+ t.u16[2] = b.u16[1];
+ t.u16[1] = a.u16[3];
+ t.u16[0] = a.u16[1];
+ } else {
+ t.u16[3] = a.u16[2];
+ t.u16[2] = a.u16[0];
+ t.u16[1] = b.u16[2];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
+ : _c_v64_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
+ : _c_v64_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.u8[3 + endian];
+ t.s16[2] = (int16_t)a.u8[2 + endian];
+ t.s16[1] = (int16_t)a.u8[1 + endian];
+ t.s16[0] = (int16_t)a.u8[0 + endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.u8[7 - endian];
+ t.s16[2] = (int16_t)a.u8[6 - endian];
+ t.s16[1] = (int16_t)a.u8[5 - endian];
+ t.s16[0] = (int16_t)a.u8[4 - endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.s8[3 + endian];
+ t.s16[2] = (int16_t)a.s8[2 + endian];
+ t.s16[1] = (int16_t)a.s8[1 + endian];
+ t.s16[0] = (int16_t)a.s8[0 + endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.s8[7 - endian];
+ t.s16[2] = (int16_t)a.s8[6 - endian];
+ t.s16[1] = (int16_t)a.s8[5 - endian];
+ t.s16[0] = (int16_t)a.s8[4 - endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
+ t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
+ t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
+ t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
+ t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
+ t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
+ t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
+ t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
+ t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
+ t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
+ t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
+ t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
+ t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
+ t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
+ t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
+ t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
+ t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
+ t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) {
+ if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
+ fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
+ pattern.u8[c], c);
+ abort();
+ }
+ t.u8[c] =
+ a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
+ }
+ return t;
+}
+
+SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
+ return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
+ a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
+ a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
+}
+
+SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
+ return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
+ (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
+}
+
+SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
+ return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
+ a.u8[0];
+}
+
+SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
+ return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
+}
+
+typedef uint32_t c_sad64_internal;
+
+/* Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum().
+ The result for more than 32 v64_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
+
+SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
+ c_v64 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
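
A short usage sketch of the init/accumulate/sum protocol described above, using the C reference flavour (the v64_* names in the optimised headers follow the same pattern); `a`, `b` and `stride` are hypothetical row pointers and pitch:

    static uint32_t sad_two_rows(const uint8_t *a, const uint8_t *b, int stride) {
      c_sad64_internal acc = c_v64_sad_u8_init();
      acc = c_v64_sad_u8(acc, c_v64_load_unaligned(a), c_v64_load_unaligned(b));
      acc = c_v64_sad_u8(acc, c_v64_load_unaligned(a + stride),
                         c_v64_load_unaligned(b + stride));
      return c_v64_sad_u8_sum(acc); /* well under the 32-call limit */
    }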
+
+typedef uint32_t c_ssd64_internal;
+
+/* Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
+
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
+ c_v64 b) {
+ int c;
+ for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
+
+SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 | b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 ^ b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 & b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 & ~b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
+ t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
+ t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int32_t u;
+ u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
+ t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
+ t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
+ t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
+ t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
+ abort();
+ }
+ t.u32[1] = a.u32[1] << n;
+ t.u32[0] = a.u32[0] << n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
+ abort();
+ }
+ t.u32[1] = a.u32[1] >> n;
+ t.u32[0] = a.u32[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
+ abort();
+ }
+ t.s32[1] = a.s32[1] >> n;
+ t.s32[0] = a.s32[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
+ c_v64 t;
+ t.u64 = x.u64 >> i * 8;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
+ c_v64 t;
+ t.u64 = x.u64 << i * 8;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
+ if (SIMD_CHECK && c > 7) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
+ return c_v64_shl_8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
+ return c_v64_shr_u8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
+ return c_v64_shr_s8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
+ return c_v64_shl_16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
+ return c_v64_shr_u16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
+ return c_v64_shr_s16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
+ return c_v64_shl_32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
+ return c_v64_shr_u32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
+ return c_v64_shr_s32(a, c);
+}
+
+#endif /* _V64_INTRINSICS_C_H */
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
new file mode 100644
index 000000000..8dcc9f6fc
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V64_INTRINSICS_H
+#define _V64_INTRINSICS_H
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+typedef __m128i v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }
+
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+ return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return _mm_packs_epi32(
+ _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
+ _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return _mm_set_epi32(0, 0, x, y);
+}
+
+SIMD_INLINE v64 v64_from_64(uint64_t x) {
+#ifdef __x86_64__
+ return _mm_cvtsi64_si128(x);
+#else
+ return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
+#endif
+}
+
+SIMD_INLINE uint64_t v64_u64(v64 x) {
+ return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
+}
+
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+// The following function requires an immediate.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#define v64_align(a, b, c) \
+ ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
+#else
+#define v64_align(a, b, c) \
+ ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
+ : (b))
+#endif
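
As an illustration of the align semantics (the shift count must be a literal in 0..7): v64_align(a, b, c) yields the low 8 bytes of the 16-byte concatenation a:b shifted right by c bytes, so the low bytes of the result come from b and the high bytes from a. A hedged sketch with made-up constants:

    static uint64_t align_demo(void) {
      v64 a = v64_from_64(0x1122334455667788ULL);
      v64 b = v64_from_64(0x99aabbccddeeff00ULL);
      return v64_u64(v64_align(a, b, 3)); /* == 0x66778899aabbccddULL */
    }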
+
+SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
+
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
+
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
+
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
+
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }
+
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }
+
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }
+
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
+
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
+
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }
+
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
+
+SIMD_INLINE v64 v64_abs_s16(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi16(a);
+#else
+ return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
+#endif
+}
+
+SIMD_INLINE v64 v64_abs_s8(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
+ return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
+#endif
+}
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi32(t, t);
+}
+
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packus_epi16(t, t);
+}
+
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi16(t, t);
+}
+
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0d0b0907050301LL));
+#else
+ return _mm_packus_epi16(
+ _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
+ _mm_setzero_si128());
+#endif
+}
+
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0e0c0a0806040200LL));
+#else
+ return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
+#endif
+}
+
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0e0b0a07060302LL));
+#else
+ return _mm_packs_epi32(
+ _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
+ _mm_setzero_si128());
+#endif
+}
+
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0d0c090805040100LL));
+#else
+ return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
+#endif
+}
+
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
+}
+
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
+ return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
+}
+
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
+}
+
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
+}
+
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return _mm_srli_si128(
+ _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
+}
+
+SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(x, pattern);
+#else
+ v64 output;
+ unsigned char *input = (unsigned char *)&x;
+ unsigned char *index = (unsigned char *)&pattern;
+ char *selected = (char *)&output;
+ int counter;
+
+ for (counter = 0; counter < 8; counter++) {
+ selected[counter] = input[index[counter]];
+ }
+
+ return output;
+#endif
+}
+
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
+ __m128i r, r1, r2, z;
+ z = _mm_setzero_si128();
+ r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8),
+ _mm_unpacklo_epi8(b, z));
+ r2 = _mm_srli_si128(r1, 8);
+ r = _mm_add_epi32(r1, r2);
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+ return ((int32_t)v64_low_u32(r)) >> 8;
+}
+
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
+ __m128i r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ __m128i x = _mm_cvtepi32_epi64(r);
+ return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(r);
+#endif
+}
+
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
+ return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
+}
+
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
+ return v64_dotp_s16(a, v64_dup_16(1));
+}
+
+typedef v64 sad64_internal;
+
+SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum().
+ The result for more than 32 v64_sad_u8() calls is undefined. */
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
+
+typedef v64 ssd64_internal;
+
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+ v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
+ v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
+ v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
+ return _mm_add_epi64(
+ s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
+}
+
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
+
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }
+
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }
+
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_mul_epu32(a, b),
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
+#endif
+}
+
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_maddubs_epi16(a, b);
+#else
+ __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
+ return _mm_packs_epi32(t, t);
+#endif
+}
+
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
+
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
+ return _mm_sub_epi8(_mm_avg_epu8(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
+}
+
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
+
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
+
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }
+
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }
+
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
+ return _mm_packs_epi16(
+ _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
+}
+
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
+#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
+#define v64_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v64_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v64_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
+#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
+#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
+#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
+#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
+#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
+#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
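
The #define forms above paste the literal count straight into the SSE shift intrinsics, so the compiler always sees an immediate; run-time counts go through the function forms defined earlier in this header. A minimal sketch (the function name is illustrative):

    static v64 scale_down(v64 v, int runtime_shift) {
      v64 fixed = v64_shr_n_u16(v, 2);          /* expands to _mm_srli_epi16(v, 2) */
      return v64_shr_u16(fixed, runtime_shift); /* variable count: _mm_srl_epi16 */
    }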
+
+#endif /* _V64_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
new file mode 100644
index 000000000..141bf01c7
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 16; i++, s += sp, r += rp) {
+ for (j = 0; j < 16; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static const int64_t cc1 = 26634;        // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708;       // 64^2*(.03*255)^2
+static const int64_t cc1_10 = 428658;    // 64^2*(.01*1023)^2
+static const int64_t cc2_10 = 3857925;   // 64^2*(.03*1023)^2
+static const int64_t cc1_12 = 6868593;   // 64^2*(.01*4095)^2
+static const int64_t cc2_12 = 61817334;  // 64^2*(.03*4095)^2
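
These values follow from the standard SSIM stabilisers c1 = (k1*L)^2 and c2 = (k2*L)^2 with k1 = 0.01, k2 = 0.03 and L the peak sample value for the bit depth, pre-multiplied by 64^2 so that the later >> 12 (divide by 4096) rescaling keeps precision. For 8-bit input, for example:

    \[
      cc_1 = 64^2 (0.01 \cdot 255)^2 = 4096 \cdot 6.5025 \approx 26634, \qquad
      cc_2 = 64^2 (0.03 \cdot 255)^2 = 4096 \cdot 58.5225 \approx 239708
    \]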
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count,
+ uint32_t bd) {
+ int64_t ssim_n, ssim_d;
+ int64_t c1, c2;
+ if (bd == 8) {
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+ } else if (bd == 10) {
+ c1 = (cc1_10 * count * count) >> 12;
+ c2 = (cc2_10 * count * count) >> 12;
+ } else if (bd == 12) {
+ c1 = (cc1_12 * count * count) >> 12;
+ c2 = (cc2_12 * count * count) >> 12;
+ } else {
+ c1 = c2 = 0;
+ assert(0);
+ }
+
+ ssim_n = (2 * sum_s * sum_r + c1) *
+ ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+
+ ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+ ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+ (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t bd, uint32_t shift) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+ sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// We are using an 8x8 moving window with the starting location of each 8x8
+// window on the 4x4 pixel grid. Such an arrangement allows the windows to
+// overlap block boundaries to penalize blocking artifacts.
+static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width,
+ int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
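
For a sense of the sampling density: windows start every 4 pixels and must lie entirely inside the plane, so a W x H plane contributes

    \[
      N = \left(\left\lfloor \tfrac{W-8}{4} \right\rfloor + 1\right)
          \left(\left\lfloor \tfrac{H-8}{4} \right\rfloor + 1\right)
      \qquad \text{samples, e.g. } W = H = 64 \;\Rightarrow\; N = 15 \cdot 15 = 225.
    \]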
+
+#if CONFIG_HIGHBITDEPTH
+static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width,
+ int height, uint32_t bd, uint32_t shift) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+ CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+ shift);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight) {
+ double a, b, c;
+ double ssimv;
+
+ a = aom_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
+ dest->y_stride, source->y_crop_width, source->y_crop_height);
+
+ b = aom_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
+
+ c = aom_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Reworking the math:
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
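
Restated compactly in the notation of the Ssimv fields (S_s = sum_s, S_r = sum_r, S_ss = sum_sq_s, S_rr = sum_sq_r, S_sr = sum_sxr, n = 64, with c_1, c_2 already scaled by n^2/4096 as above), ssimv_similarity() below computes

    \[
      l = \frac{2 S_s S_r + c_1}{S_s^2 + S_r^2 + c_1}, \qquad
      v = \frac{2 n S_{sr} - 2 S_s S_r + c_2}{n S_{ss} - S_s^2 + n S_{rr} - S_r^2 + c_2}, \qquad
      \mathrm{ssim} = l \cdot v.
    \]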
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is very sensitive at the dark end of the luminance
+// range and almost completely insensitive at the bright end. Compare the two
+// pairs (1,3) and (250,252): the term gives 2*1*3/(1^2+3^2) = .60 for the
+// first, but 2*250*252/(250^2+252^2) => .99997 for the second.
+//
+// As a result, this tweaked version of the calculation takes the luminance
+// as a percentage off from the peak possible value:
+//
+// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, Ssimv *sv) {
+ aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+ &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+ aom_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = { 0 };
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+ // Not sure there's a great way to handle the edge pixels
+ // in ssim when using a window. Seems biased against edge pixels
+ // however you handle this. This uses only samples that are
+ // fully in the frame.
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+      // dssim is calculated for use as an actual error metric and
+      // is scaled up to the same range as sum square error.
+      // Since we are subsampling every 16th point, maybe this should be
+      // *16?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+ // term 1 -> uses change in scene Variance to weight error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+      // term 3 -> measures inconsistency in ssim scores between frames
+      //           1 - (2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+ssim(Fi-1)^2)).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+        // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation,
+        // 1.0 = 4x4x255x255 is the worst error we can possibly have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide-by-zero
+        // issues; beyond that, they act as a kind of weighting between the
+        // terms. No testing of what the right values should be has been done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+ // This measures how much consistent variance is in two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term =
+ (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+ // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term =
+ (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+ // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+ double ssim_term =
+ pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+ // it to an snr scaled value.
+ if (ssim_term > 1) ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr to something like sum square error).
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores a bit less, as it will be less visible
+        // to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ old_ssim_total += 0;
+ }
+
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ if (old_ssim_total == 0) inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
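
For reference, the per-block inconsistency accumulated in the loop above can be summarised as follows, where V, M and S denote the var_*, mean_* and ssim_* values of the current (new) and previous (old) co-located block:

    \[
      \mathrm{inc} = k \,\bigl(1 - \min(S_{\mathrm{term}}, 1)\bigr)
                     \cdot \frac{2 V_{\mathrm{old}} V_{\mathrm{new}} + c_1}{V_{\mathrm{old}}^2 + V_{\mathrm{new}}^2 + c_1}
                     \cdot \frac{2 M_{\mathrm{old}} M_{\mathrm{new}} + c_2}{M_{\mathrm{old}}^2 + M_{\mathrm{new}}^2 + c_2},
      \qquad
      S_{\mathrm{term}} = \left(\frac{2 S_{\mathrm{old}} S_{\mathrm{new}} + c_3}{S_{\mathrm{old}}^2 + S_{\mathrm{new}}^2 + c_3}\right)^{\!5},
      \quad k = 4 \cdot 4 \cdot 255 \cdot 255.
    \]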
+
+#if CONFIG_HIGHBITDEPTH
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd) {
+ double a, b, c;
+ double ssimv;
+ uint32_t shift = 0;
+
+ assert(bd >= in_bd);
+ shift = bd - in_bd;
+
+ a = aom_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, shift);
+
+ b = aom_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
+
+ c = aom_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
new file mode 100644
index 000000000..902735e50
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_SSIM_H_
+#define AOM_DSP_SSIM_H_
+
+#define MAX_SSIM_DB 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./aom_config.h"
+#include "aom_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // source sum (over 8x8 region)
+  uint32_t sum_s;
+
+  // reference sum (over 8x8 region)
+  uint32_t sum_r;
+
+  // source sum squared (over 8x8 region)
+  uint32_t sum_sq_s;
+
+  // reference sum squared (over 8x8 region)
+ uint32_t sum_sq_r;
+
+ // sum of source times reference (over 8x8 region)
+ uint32_t sum_sxr;
+
+ // calculated ssim score between source and reference
+ double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+ // ssim consistency error metric ( see code for explanation )
+ double ssimc;
+
+ // standard ssim
+ double ssim;
+
+ // revised ssim ( see code for explanation)
+ double ssim2;
+
+ // ssim restated as an error metric like sse
+ double dssim;
+
+ // dssim converted to decibels
+ double dssimd;
+
+ // ssimc converted to decibels
+ double ssimcd;
+} Metrics;
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
+
+double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight);
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd);
+
+#if CONFIG_HIGHBITDEPTH
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd);
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_SSIM_H_
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
new file mode 100644
index 000000000..8dda96efb
--- /dev/null
+++ b/third_party/aom/aom_dsp/subtract.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r, c;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride, int bd) {
+ int r, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ (void)bd;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
new file mode 100644
index 000000000..b9155fdc0
--- /dev/null
+++ b/third_party/aom/aom_dsp/sum_squares.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+
+uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
+ int height) {
+ int r, c;
+ uint64_t ss = 0;
+
+ for (r = 0; r < height; r++) {
+ for (c = 0; c < width; c++) {
+ const int16_t v = src[c];
+ ss += v * v;
+ }
+ src += src_stride;
+ }
+
+ return ss;
+}
+
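+// Note: n must be at least 1; the do/while loop below always runs once.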
+uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+ uint64_t ss = 0;
+ do {
+ const int16_t v = *src++;
+ ss += v * v;
+ } while (--n);
+
+ return ss;
+}
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
new file mode 100644
index 000000000..a5e964aad
--- /dev/null
+++ b/third_party/aom/aom_dsp/txfm_common.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_TXFM_COMMON_H_
+#define AOM_DSP_TXFM_COMMON_H_
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
+
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
+
+// Constants:
+// for (int i = 1; i< 32; ++i)
+// printf("static const int cospi_%d_64 = %.0f;\n", i,
+// round(16384 * cos(i*M_PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
+static const tran_high_t cospi_1_64 = 16364;
+static const tran_high_t cospi_2_64 = 16305;
+static const tran_high_t cospi_3_64 = 16207;
+static const tran_high_t cospi_4_64 = 16069;
+static const tran_high_t cospi_5_64 = 15893;
+static const tran_high_t cospi_6_64 = 15679;
+static const tran_high_t cospi_7_64 = 15426;
+static const tran_high_t cospi_8_64 = 15137;
+static const tran_high_t cospi_9_64 = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
+
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+
+#endif // AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
new file mode 100644
index 000000000..9fc0db783
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.c
@@ -0,0 +1,1249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
+uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride) {
+ int distortion = 0;
+ int r, c;
+
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < 4; ++c) {
+ int diff = a[c] - b[c];
+ distortion += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ return distortion;
+}
+
+uint32_t aom_get_mb_ss_c(const int16_t *a) {
+ unsigned int i, sum = 0;
+
+ for (i = 0; i < 256; ++i) {
+ sum += a[i] * a[i];
+ }
+
+ return sum;
+}
+
+uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse);
+}
+
+uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return aom_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse);
+}
+
+uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse);
+}
+
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse, int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
+ uint32_t sse;
+ int sum;
+ variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
+ return sse;
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the first pass of a 2-D separable filter.
+//
+// Produces 16-bit intermediate output to retain precision for the next pass.
+// The two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether
+// the filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input to
+// the next.
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
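+
+// Worked example with hypothetical taps: for a 2-tap filter {96, 32} (taps
+// summing to FILTER_WEIGHT = 128) and inputs a[0] = 10, a[pixel_step] = 20,
+// the output is ROUND_POWER_OF_TWO(10 * 96 + 20 * 32, FILTER_BITS) =
+// (1600 + 64) >> 7 = 13.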
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass.
+// The two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether
+// the filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input to
+// the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+#define VAR(W, H) \
+ uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
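+
+// The macro above uses the identity variance = sse - sum^2 / (W * H), i.e.
+// E[d^2] - E[d]^2 scaled by the block area. For example, if every difference
+// in an 8x8 block equals 2, then sum = 128, sse = 256 and the variance is
+// 256 - 128 * 128 / 64 = 0, as expected for a constant offset.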
+
+#define SUBPIX_VAR(W, H) \
+ uint32_t aom_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+ }
+
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ \
+ return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ }
+
+/* Identical to the variance call except that it returns the sum through an
+ * additional pass-by-reference parameter instead of computing and returning
+ * sse - sum^2 / (w*h).
+ */
+#define GET_VAR(W, H) \
+ void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ int *sum) { \
+ variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+ }
+
+/* Identical to the variance call except that it does not subtract
+ * sum^2 / (w*h); it returns the sse in addition to storing it in the
+ * passed-in variable.
+ */
+#define MSE(W, H) \
+ uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+/* All three forms of the variance are available for the same block sizes. */
+#define VARIANCES(W, H) \
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
+
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+VARIANCES(4, 2)
+VARIANCES(2, 4)
+VARIANCES(2, 2)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+// Get pred block from up-sampled reference.
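+// The reference is assumed to be up-sampled by 8 in each direction, so taking
+// every 8th sample of every 8th row (stride = ref_stride << 3, k += 8) reads
+// out one prediction block at the sub-pel phase selected by the caller.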
+void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
+ const uint8_t *ref, int ref_stride) {
+ int i, j, k;
+ int stride = ref_stride << 3;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0, k = 0; j < width; j++, k += 8) {
+ comp_pred[j] = ref[k];
+ }
+ comp_pred += width;
+ ref += stride;
+ }
+}
+
+void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride) {
+ int i, j;
+ int stride = ref_stride << 3;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = ref[(j << 3)] + pred[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += stride;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int w, int h) {
+ uint64_t sse;
+ int64_t sum;
+ highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
+ return sse;
+}
+
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)sse_long;
+ *sum = (int)sum_long;
+}
+
+static void highbd_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
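+
+// The shifts above normalize the 10- and 12-bit accumulations back towards an
+// 8-bit scale (sum by 2 or 4 bits, sse by 4 or 8 bits) so the results still
+// fit the 32-bit outputs shared with the 8-bit code paths.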
+
+#define HIGHBD_VAR(W, H) \
+ uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_GET_VAR(S) \
+ void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ } \
+ \
+ void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ } \
+ \
+ void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ }
+
+#define HIGHBD_MSE(W, H) \
+ uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ }
+
+/* All three forms of the variance are available for the same block sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+HIGHBD_VARIANCES(4, 2)
+HIGHBD_VARIANCES(2, 4)
+HIGHBD_VARIANCES(2, 2)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
+ const uint8_t *ref8, int ref_stride) {
+ int i, j;
+ int stride = ref_stride << 3;
+
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ comp_pred[j] = ref[(j << 3)];
+ }
+ comp_pred += width;
+ ref += stride;
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
+ const uint8_t *pred8, int width,
+ int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ int stride = ref_stride << 3;
+
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[(j << 3)];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += stride;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_AV1 && CONFIG_EXT_INTER
+void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, const uint8_t *m, int m_stride, int w, int h,
+ unsigned int *sse, int *sum) {
+ int i, j;
+
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = (a[j] - b[j]) * (m[j]);
+ sum64 += diff;
+ sse64 += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 6);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12);
+}
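+
+// The mask values appear to act as 6-bit weights (up to 64), so the
+// accumulated sum is scaled back by 2^6 and the sum of squares by 2^12
+// before being returned.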
+
+#define MASK_VAR(W, H) \
+ unsigned int aom_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk, \
+ msk_stride, sse); \
+ }
+
+MASK_VAR(4, 4)
+MASK_SUBPIX_VAR(4, 4)
+
+MASK_VAR(4, 8)
+MASK_SUBPIX_VAR(4, 8)
+
+MASK_VAR(8, 4)
+MASK_SUBPIX_VAR(8, 4)
+
+MASK_VAR(8, 8)
+MASK_SUBPIX_VAR(8, 8)
+
+MASK_VAR(8, 16)
+MASK_SUBPIX_VAR(8, 16)
+
+MASK_VAR(16, 8)
+MASK_SUBPIX_VAR(16, 8)
+
+MASK_VAR(16, 16)
+MASK_SUBPIX_VAR(16, 16)
+
+MASK_VAR(16, 32)
+MASK_SUBPIX_VAR(16, 32)
+
+MASK_VAR(32, 16)
+MASK_SUBPIX_VAR(32, 16)
+
+MASK_VAR(32, 32)
+MASK_SUBPIX_VAR(32, 32)
+
+MASK_VAR(32, 64)
+MASK_SUBPIX_VAR(32, 64)
+
+MASK_VAR(64, 32)
+MASK_SUBPIX_VAR(64, 32)
+
+MASK_VAR(64, 64)
+MASK_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+MASK_VAR(64, 128)
+MASK_SUBPIX_VAR(64, 128)
+
+MASK_VAR(128, 64)
+MASK_SUBPIX_VAR(128, 64)
+
+MASK_VAR(128, 128)
+MASK_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_HIGHBITDEPTH
+void highbd_masked_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m,
+ int m_stride, int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ int i, j;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = (a[j] - b[j]) * (m[j]);
+ *sum += (int64_t)diff;
+ *sse += (int64_t)diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ *sum = (*sum >= 0) ? *sum : -*sum;
+ *sum = ROUND_POWER_OF_TWO(*sum, 6);
+ *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, const uint8_t *m, int m_stride, int w,
+ int h, unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
+ &sse64, &sum64);
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+void highbd_10_masked_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
+ &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+void highbd_12_masked_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
+ &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_MASK_VAR(W, H) \
+ unsigned int aom_highbd_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \
+ &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \
+ sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \
+ sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_masked_variance##W##x##H##_c( \
+ CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_masked_variance##W##x##H##_c( \
+ CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_masked_variance##W##x##H##_c( \
+ CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
+ }
+
+HIGHBD_MASK_VAR(4, 4)
+HIGHBD_MASK_SUBPIX_VAR(4, 4)
+
+HIGHBD_MASK_VAR(4, 8)
+HIGHBD_MASK_SUBPIX_VAR(4, 8)
+
+HIGHBD_MASK_VAR(8, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 4)
+
+HIGHBD_MASK_VAR(8, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 8)
+
+HIGHBD_MASK_VAR(8, 16)
+HIGHBD_MASK_SUBPIX_VAR(8, 16)
+
+HIGHBD_MASK_VAR(16, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 8)
+
+HIGHBD_MASK_VAR(16, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 16)
+
+HIGHBD_MASK_VAR(16, 32)
+HIGHBD_MASK_SUBPIX_VAR(16, 32)
+
+HIGHBD_MASK_VAR(32, 16)
+HIGHBD_MASK_SUBPIX_VAR(32, 16)
+
+HIGHBD_MASK_VAR(32, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 32)
+
+HIGHBD_MASK_VAR(32, 64)
+HIGHBD_MASK_SUBPIX_VAR(32, 64)
+
+HIGHBD_MASK_VAR(64, 32)
+HIGHBD_MASK_SUBPIX_VAR(64, 32)
+
+HIGHBD_MASK_VAR(64, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_VAR(64, 128)
+HIGHBD_MASK_SUBPIX_VAR(64, 128)
+
+HIGHBD_MASK_VAR(128, 64)
+HIGHBD_MASK_SUBPIX_VAR(128, 64)
+
+HIGHBD_MASK_VAR(128, 128)
+HIGHBD_MASK_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_AV1 && CONFIG_EXT_INTER
+
+#if CONFIG_AV1 && CONFIG_MOTION_VAR
+static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
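+
+// wsrc and mask appear to be pre-scaled by 1 << 12, so the weighted
+// difference wsrc[j] - pre[j] * mask[j] is rounded back down by 12 bits
+// before accumulating.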
+
+#define OBMC_VAR(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \
+ }
+
+OBMC_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 4)
+
+OBMC_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 8)
+
+OBMC_VAR(8, 4)
+OBMC_SUBPIX_VAR(8, 4)
+
+OBMC_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 8)
+
+OBMC_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 16)
+
+OBMC_VAR(16, 8)
+OBMC_SUBPIX_VAR(16, 8)
+
+OBMC_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 16)
+
+OBMC_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 32)
+
+OBMC_VAR(32, 16)
+OBMC_SUBPIX_VAR(32, 16)
+
+OBMC_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 32)
+
+OBMC_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 64)
+
+OBMC_VAR(64, 32)
+OBMC_SUBPIX_VAR(64, 32)
+
+OBMC_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+OBMC_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 128)
+
+OBMC_VAR(128, 64)
+OBMC_SUBPIX_VAR(128, 64)
+
+OBMC_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int i, j;
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
+
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_OBMC_VAR(W, H) \
+ unsigned int aom_highbd_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ }
+
+HIGHBD_OBMC_VAR(4, 4)
+HIGHBD_OBMC_SUBPIX_VAR(4, 4)
+
+HIGHBD_OBMC_VAR(4, 8)
+HIGHBD_OBMC_SUBPIX_VAR(4, 8)
+
+HIGHBD_OBMC_VAR(8, 4)
+HIGHBD_OBMC_SUBPIX_VAR(8, 4)
+
+HIGHBD_OBMC_VAR(8, 8)
+HIGHBD_OBMC_SUBPIX_VAR(8, 8)
+
+HIGHBD_OBMC_VAR(8, 16)
+HIGHBD_OBMC_SUBPIX_VAR(8, 16)
+
+HIGHBD_OBMC_VAR(16, 8)
+HIGHBD_OBMC_SUBPIX_VAR(16, 8)
+
+HIGHBD_OBMC_VAR(16, 16)
+HIGHBD_OBMC_SUBPIX_VAR(16, 16)
+
+HIGHBD_OBMC_VAR(16, 32)
+HIGHBD_OBMC_SUBPIX_VAR(16, 32)
+
+HIGHBD_OBMC_VAR(32, 16)
+HIGHBD_OBMC_SUBPIX_VAR(32, 16)
+
+HIGHBD_OBMC_VAR(32, 32)
+HIGHBD_OBMC_SUBPIX_VAR(32, 32)
+
+HIGHBD_OBMC_VAR(32, 64)
+HIGHBD_OBMC_SUBPIX_VAR(32, 64)
+
+HIGHBD_OBMC_VAR(64, 32)
+HIGHBD_OBMC_SUBPIX_VAR(64, 32)
+
+HIGHBD_OBMC_VAR(64, 64)
+HIGHBD_OBMC_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMC_VAR(64, 128)
+HIGHBD_OBMC_SUBPIX_VAR(64, 128)
+
+HIGHBD_OBMC_VAR(128, 64)
+HIGHBD_OBMC_SUBPIX_VAR(128, 64)
+
+HIGHBD_OBMC_VAR(128, 128)
+HIGHBD_OBMC_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
new file mode 100644
index 000000000..7c925cfac
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_VARIANCE_H_
+#define AOM_DSP_VARIANCE_H_
+
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+
+typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride);
+
+typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *second_pred);
+
+typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
+ int b_stride, int n);
+
+typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sad_array);
+
+typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *const b_array[],
+ int b_stride, unsigned int *sad_array);
+
+typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*aom_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred);
+
+#if CONFIG_AV1 && CONFIG_EXT_INTER
+typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *msk_ptr,
+ int msk_stride);
+typedef unsigned int (*aom_masked_variance_fn_t)(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *msk, int msk_stride, unsigned int *sse);
+typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse);
+#endif // CONFIG_AV1 && CONFIG_EXT_INTER
+
+#if CONFIG_AV1 && CONFIG_MOTION_VAR
+typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk);
+typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
+ int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk,
+ unsigned int *sse);
+typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
+ const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
+ const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
+#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
+
+#if CONFIG_AV1
+typedef struct aom_variance_vtable {
+ aom_sad_fn_t sdf;
+ aom_sad_avg_fn_t sdaf;
+ aom_variance_fn_t vf;
+ aom_subpixvariance_fn_t svf;
+ aom_subp_avg_variance_fn_t svaf;
+ aom_sad_multi_fn_t sdx3f;
+ aom_sad_multi_fn_t sdx8f;
+ aom_sad_multi_d_fn_t sdx4df;
+#if CONFIG_EXT_INTER
+ aom_masked_sad_fn_t msdf;
+ aom_masked_variance_fn_t mvf;
+ aom_masked_subpixvariance_fn_t msvf;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR
+ aom_obmc_sad_fn_t osdf;
+ aom_obmc_variance_fn_t ovf;
+ aom_obmc_subpixvariance_fn_t osvf;
+#endif // CONFIG_MOTION_VAR
+} aom_variance_fn_ptr_t;
+#endif // CONFIG_AV1
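+
+// Illustrative sketch of how the variance-related fields of a 16x16 entry
+// could be filled in with the C functions from this directory (the SAD and
+// multi-SAD entries are defined elsewhere):
+//   aom_variance_fn_ptr_t fn;
+//   fn.vf = aom_variance16x16_c;
+//   fn.svf = aom_sub_pixel_variance16x16_c;
+//   fn.svaf = aom_sub_pixel_avg_variance16x16_c;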
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h);
+
+#if CONFIG_HIGHBITDEPTH
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int w, int h);
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_VARIANCE_H_
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
new file mode 100644
index 000000000..4067b0b53
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction aom_filter_block1d16_v8_sse2;
+filter8_1dfunction aom_filter_block1d16_h8_sse2;
+filter8_1dfunction aom_filter_block1d8_v8_sse2;
+filter8_1dfunction aom_filter_block1d8_h8_sse2;
+filter8_1dfunction aom_filter_block1d4_v8_sse2;
+filter8_1dfunction aom_filter_block1d4_h8_sse2;
+filter8_1dfunction aom_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction aom_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction aom_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction aom_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction aom_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction aom_filter_block1d4_h8_avg_sse2;
+
+filter8_1dfunction aom_filter_block1d16_v2_sse2;
+filter8_1dfunction aom_filter_block1d16_h2_sse2;
+filter8_1dfunction aom_filter_block1d8_v2_sse2;
+filter8_1dfunction aom_filter_block1d8_h2_sse2;
+filter8_1dfunction aom_filter_block1d4_v2_sse2;
+filter8_1dfunction aom_filter_block1d4_h2_sse2;
+filter8_1dfunction aom_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction aom_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction aom_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction aom_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction aom_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
+
+// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
+
+// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_, sse2);
+
+#if CONFIG_HIGHBITDEPTH && ARCH_X86_64
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
+
+// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ sse2);
+
+// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2);
+HIGH_FUN_CONV_2D(avg_, sse2);
+
+#if CONFIG_LOOP_RESTORATION
+// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
+// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
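+// Temporarily bumping the centre tap by 128 (1 << FILTER_BITS) makes the
+// normalized filter additionally pass one full copy of the source pixel
+// through, which gives the intended add-src behaviour; the offset is removed
+// again afterwards.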
+void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ ((int16_t *)filter_x)[3] += 128;
+ ((int16_t *)filter_y)[3] += 128;
+ aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ ((int16_t *)filter_x)[3] -= 128;
+ ((int16_t *)filter_y)[3] -= 128;
+}
+#endif // CONFIG_LOOP_RESTORATION
+#endif // CONFIG_HIGHBITDEPTH && ARCH_X86_64
+#endif // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
new file mode 100644
index 000000000..4d3142867
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
@@ -0,0 +1,345 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1-2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ fx, fxs, fy, fys, w, h, bd
+%else
+%define pavg pavgb
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ fx, fxs, fy, fys, w, h
+%endif
+ mov r4d, dword wm
+%ifidn %2, highbd
+ shl r4d, 1
+ shl srcq, 1
+ shl src_strideq, 1
+ shl dstq, 1
+ shl dst_strideq, 1
+%else
+ cmp r4d, 4
+ je .w4
+%endif
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+
+%if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ cmp r4d, 64
+ je .w64
+%ifidn %2, highbd
+ cmp r4d, 128
+ je .w128
+
+.w256:
+ mov r4d, dword hm
+.loop256:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ movu m0, [srcq+128]
+ movu m1, [srcq+128+16]
+ movu m2, [srcq+128+32]
+ movu m3, [srcq+128+48]
+%ifidn %1, avg
+ pavg m0, [dstq+128]
+ pavg m1, [dstq+128+16]
+ pavg m2, [dstq+128+32]
+ pavg m3, [dstq+128+48]
+%endif
+ mova [dstq+128 ], m0
+ mova [dstq+128+16], m1
+ mova [dstq+128+32], m2
+ mova [dstq+128+48], m3
+ movu m0, [srcq+128+64]
+ movu m1, [srcq+128+80]
+ movu m2, [srcq+128+96]
+ movu m3, [srcq+128+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+128+64]
+ pavg m1, [dstq+128+80]
+ pavg m2, [dstq+128+96]
+ pavg m3, [dstq+128+112]
+%endif
+ mova [dstq+128+64], m0
+ mova [dstq+128+80], m1
+ mova [dstq+128+96], m2
+ mova [dstq+128+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop256
+ RET
+%endif
+
+.w128:
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop128
+ RET
+
+%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION
+
+%ifidn %2, highbd
+ cmp r4d, 64
+ je .w64
+
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop128
+ RET
+%endif
+%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION
+
+.w64:
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq +16]
+ pavg m2, [dstq+dst_strideq]
+ pavg m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+dst_strideq]
+ pavg m2, [dstq+dst_strideq*2]
+ pavg m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movh m4, [dstq]
+ movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2]
+ movh m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+%ifnidn %2, highbd
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movd m0, [srcq]
+ movd m1, [srcq+src_strideq]
+ movd m2, [srcq+src_strideq*2]
+ movd m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movd m4, [dstq]
+ movd m5, [dstq+dst_strideq]
+ movd m6, [dstq+dst_strideq*2]
+ movd m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movd [dstq ], m0
+ movd [dstq+dst_strideq ], m1
+ movd [dstq+dst_strideq*2], m2
+ movd [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+convolve_fn copy
+convolve_fn avg
+%if CONFIG_HIGHBITDEPTH
+convolve_fn copy, highbd
+convolve_fn avg, highbd
+%endif
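
For reference, a scalar sketch of what the copy and avg cases of the macro above compute; pavgb/pavgw are rounding averages, (a + b + 1) >> 1, and the highbd variants simply double the width and strides in the prologue. Plain C under that reading, not the aom API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* copy: move w bytes per row. */
static void convolve_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int r;
  for (r = 0; r < h; ++r) memcpy(dst + r * dst_stride, src + r * src_stride, w);
}

/* avg: rounding average of the new prediction with what is already in dst. */
static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int r, c;
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c)
      dst[r * dst_stride + c] = (uint8_t)(
          (dst[r * dst_stride + c] + src[r * src_stride + c] + 1) >> 1);
}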
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..e6d357ba3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -0,0 +1,965 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+ punpcklwd xmm1, xmm7
+
+ movdqa k0k6, xmm0
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+ punpcklwd xmm0, xmm6 ;two row in one register
+ punpcklwd xmm1, xmm7
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm6, xmm7
+ punpckhwd xmm2, xmm5
+ punpckhwd xmm3, xmm4
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm1, xmm6
+ punpcklwd xmm6, xmm7
+ punpckhwd xmm1, xmm7
+ movdqa xmm7, xmm2
+ punpcklwd xmm2, xmm5
+ punpckhwd xmm7, xmm5
+
+ movdqu xmm5, temp
+ movdqu temp, xmm4
+ movdqa xmm4, xmm3
+ punpcklwd xmm3, xmm5
+ punpckhwd xmm4, xmm5
+ movdqu xmm5, temp
+
+ pmaddwd xmm0, k0k1
+ pmaddwd xmm5, k0k1
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+ paddd xmm5, xmm1
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi + %2], xmm0
+%endm
+
+;void aom_highbd_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 1, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 1, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
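
For reference, a scalar sketch of what the HIGH_APPLY_FILTER_4/_8 macros above compute per output sample: eight tap products, +64 rounding, an arithmetic shift by 7, then a clamp to [0, (1 << bd) - 1], which is the max/min pair the prologues build from arg(6). The avg entry points additionally pavgw the result with the destination. Plain C, not the aom API:

#include <stdint.h>

static uint16_t highbd_8tap_ref(const uint16_t *s, const int16_t *filter,
                                int bd) {
  const int max = (1 << bd) - 1;  /* same bound the asm derives with psllw/psubw */
  int sum = 0, k;
  for (k = 0; k < 8; ++k) sum += filter[k] * s[k - 3];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}

/* avg variant: rounding average with the value already stored at dst. */
static uint16_t highbd_8tap_avg_ref(const uint16_t *s, const int16_t *filter,
                                    uint16_t dst, int bd) {
  return (uint16_t)((highbd_8tap_ref(s, filter, bd) + dst + 1) >> 1);
}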
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 000000000..9e2ec748c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,497 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklwd xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm5, rdx
+ movq xmm2, rcx
+ pshufd xmm5, xmm5, 0b
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+ punpcklwd xmm0, xmm1 ;two row in one register
+ pmaddwd xmm0, xmm4 ;multiply the filter factors
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%if ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm6, [rdx] ;load filters
+
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm8, rdx
+ movq xmm5, rcx
+ pshufd xmm8, xmm8, 0b
+ movdqa xmm1, xmm8
+ psllw xmm8, xmm5
+ psubw xmm8, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+ movdqa xmm6, xmm0
+ punpckhwd xmm6, xmm1
+ punpcklwd xmm0, xmm1
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+ movdqa xmm9, xmm0
+ movdqa xmm6, xmm2
+ punpckhwd xmm9, xmm1
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ pmaddwd xmm9, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm9, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm9, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm9 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+ pminsw xmm2, xmm8
+ pmaxsw xmm2, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+%endif
+
+global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
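
For reference, the 2-tap (bilinear) paths above use only taps 3 and 4 of the filter, with the same +64 / >> 7 rounding and highbd clamp; a and b are the two source samples one pixel (horizontal) or one row (vertical) apart. Plain C sketch, not the aom API:

#include <stdint.h>

static uint16_t highbd_bilinear_ref(uint16_t a, uint16_t b,
                                    const int16_t *filter, int bd) {
  const int max = (1 << bd) - 1;
  int sum = (filter[3] * a + filter[4] * b + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}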
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
new file mode 100644
index 000000000..61476b8be
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_ports/mem.h"
+
+// filters for 16_h8 and 16_v8
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
+
+static void aom_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+  // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  // double the source and destination strides (two rows per iteration)
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
+ srcReg32b1 = _mm256_inserti128_si256(
+ srcReg32b1,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
+ 1);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(
+ srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // read 2 strides of the next 16 bytes
+    // (part of them was already covered by the earlier loads)
+ srcReg32b2 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
+ srcReg32b2 = _mm256_inserti128_si256(
+ srcReg32b2,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
+ 1);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(
+ srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // filter the source buffer
+ srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+
+    // pack each 16 bit value down to 8 bits; the first lane contains the
+    // first convolve result and the second lane contains the second
+    // convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt32b1_1));
+
+    // save the next 16 bytes
+ _mm_store_si128((__m128i *)(output_ptr + output_pitch),
+ _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process only the last 16 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // read the next 16 bytes
+    // (part of it was already covered by the earlier load)
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ // filter the source buffer
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 =
+ _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
+
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // pack each 16 bit value down to 8 bits; the first lane contains the
+    // first convolve result and the second lane contains the second
+    // convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
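
A note on the horizontal kernel above: the shuffle masks filt1_global_avx2 through filt4_global_avx2 pair adjacent source bytes so that each _mm256_maddubs_epi16 yields one tap-pair product per output pixel, and four such pairs cover all eight taps. A scalar sketch of one pair, under my reading of the masks; the helper name is illustrative:

#include <stdint.h>

/* For output pixel i and tap pair (k, k + 1): unsigned source bytes times the
 * signed 8-bit filter pair, summed with signed 16-bit saturation, matching
 * the maddubs semantics.  s points at the first output pixel of the row, so
 * taps k = 0..7 read s[i - 3] .. s[i + 4]. */
static int16_t maddubs_pair_ref(const uint8_t *s, int i, const int8_t *f8, int k) {
  int v = s[i + k - 3] * f8[k] + s[i + k - 2] * f8[k + 1];
  if (v > INT16_MAX) v = INT16_MAX;
  if (v < INT16_MIN) v = INT16_MIN;
  return (int16_t)v;
}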
+
+static void aom_filter_block1d16_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+  // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  // double the source and destination strides (two rows per iteration)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+  // load 16 bytes from each of 7 consecutive rows, one src_pitch apart
+ srcReg32b1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
+ srcReg32b2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
+ srcReg32b3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
+ srcReg32b4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
+ srcReg32b5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+ srcReg32b6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // place each pair of consecutive loads in the same 256 bit register
+ srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+ _mm256_castsi256_si128(srcReg32b2), 1);
+ srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+ _mm256_castsi256_si128(srcReg32b3), 1);
+ srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
+ _mm256_castsi256_si128(srcReg32b4), 1);
+ srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
+ _mm256_castsi256_si128(srcReg32b5), 1);
+ srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
+ _mm256_castsi256_si128(srcReg32b6), 1);
+ srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
+ _mm256_castsi256_si128(srcReg32b7), 1);
+
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+ // save
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+
+ // save
+ srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+
+ // save
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+ // save
+ srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+    // load the last 2 rows of 16 bytes and place every two
+    // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+
+    // pack each 16 bit value down to 8 bits; the first lane contains the
+    // first convolve result and the second lane contains the second
+    // convolve result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr += src_stride;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
+
+    // save the next 16 bytes
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch),
+ _mm256_extractf128_si256(srcReg32b1, 1));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 =
+ _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+ srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 =
+ _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+
+    // pack each 16 bit value down to 8 bits; the first lane contains the
+    // first convolve result and the second lane contains the second
+    // convolve result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
+#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3
+#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3
+#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3
+#else // ARCH_X86
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3
+#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3
+#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3
+#endif // ARCH_X86_64
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
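+// Note: judging from the defines above, only the 16-pixel-wide 8-tap kernels
+// use AVX2 directly in this file; the 8-pixel, 4-pixel and bilinear (2-tap)
+// kernels are mapped onto their SSSE3 counterparts.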
+// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2);
+#endif  // HAVE_AVX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 000000000..be37738df
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
+// filters only for the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// filters for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
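+// The tables above are pshufb (_mm_shuffle_epi8) byte-index masks: each
+// consecutive byte pair (i, i + 1) gathers two neighbouring source pixels so
+// that a single _mm_maddubs_epi16 applies two adjacent filter taps per
+// 16-bit result.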
+
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
+
+void aom_filter_block1d4_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, srcReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
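+  // (64 is the rounding bias added before the arithmetic shift right by 7
+  // further below, i.e. round-to-nearest of the filter sum.)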
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) taps to 8-bit (byte) and duplicate the same
+  // data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter into the first lane
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bits in the filter into the first lane
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+ // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the second lane
+ // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+ // loading the local filters
+ shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
+ shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+  // extract the upper half of each register
+ srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+ srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+ minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+ // add and saturate all the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ src_ptr += src_pixels_per_line;
+
+ // save only 4 bytes
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d8_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) taps to 8-bit (byte) and duplicate the same
+  // data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
+ srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+ // add and saturate all the results together
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr += src_pixels_per_line;
+
+ // save only 8 bytes
+ _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d8_v8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, minReg;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) taps to 8-bit (byte) and duplicate the same
+  // data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ // load the first 7 rows of 8 bytes
+ srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+ for (i = 0; i < output_height; i++) {
+ // load the last 8 bytes
+ srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the result together
+ srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+ // merge the result together
+ srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+ // add and saturate the results together
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr += src_pitch;
+
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+ output_ptr += out_pitch;
+ }
+}
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
+#if CONFIG_LOOP_RESTORATION
+filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
+#endif
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
+
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ ssse3);
+
+#if CONFIG_LOOP_RESTORATION
+FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
+ ssse3);
+FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
+ src - src_stride * 3, add_src_, ssse3);
+#endif
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
+ const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
+ const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
+ const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
+ \
+ const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
+ const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
+ const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
+ const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
+ \
+ out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
+ out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
+ out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
+ out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
+ out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
+ out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
+ out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
+ out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
+ }
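+// TRANSPOSE_8X8 transposes the 8x8 byte block held in the low halves of the
+// eight input registers; each output register carries one transposed row
+// duplicated into both of its 64-bit halves.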
+
+static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *x_filter) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
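+  // _mm_mulhrs_epi16 by 256 computes round(x * 256 / 2^15) = (x + 64) >> 7,
+  // i.e. the same round-and-shift used by the 8-tap kernels above.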
+ const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
+ // pack and duplicate the filter values
+ const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+ const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
+ const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
+ const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
+ const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
+ const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
+ const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
+ const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
+ const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
+ // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+ const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+ // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+ const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+ // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
+ const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
+ // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
+ const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
+ // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+ const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
+ const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
+ const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
+ const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
+ const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+ const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+ const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+ const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+ // add and saturate the results together
+ const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+ const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, min_x2x1);
+ temp = _mm_adds_epi16(temp, max_x2x1);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_mulhrs_epi16(temp, k_256);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride) {
+ __m128i A, B, C, D, E, F, G, H;
+
+ A = _mm_loadl_epi64((const __m128i *)src);
+ B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+ C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+ E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
+ F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
+ G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
+ H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
+
+ TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
+
+ _mm_storel_epi64((__m128i *)dst, A);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
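+  // step back SUBPEL_TAPS / 2 - 1 = 3 columns so the 8-tap window is centred
+  // on the output position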
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = h + (8 - (h & 0x7));
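+  // Note: when h is already a multiple of 8 this still rounds up by a full
+  // extra block of 8 rows; the caller's intermediate buffer reserves 8 spare
+  // rows for exactly this case (see scaledconvolve2d below).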
+
+ do {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 8) {
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filtered values back to dst
+ transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+ }
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
+
+static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *filter) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+ const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+ const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ // TRANSPOSE...
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ //
+ // TO
+ //
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+ // 04 14 24 34
+ // 05 15 25 35
+ // 06 16 26 36
+ // 07 17 27 37
+ //
+ // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+ const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+ // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+ const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+ // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+ const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+ const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ // 02 03 12 13 22 23 32 33
+ const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
+ // 06 07 16 17 26 27 36 37
+ const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+ const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+ const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+ const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+ // add and saturate the results together
+ const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+ const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, min_x2x1);
+ temp = _mm_adds_epi16(temp, max_x2x1);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_mulhrs_epi16(temp, k_256);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride) {
+ __m128i A = _mm_cvtsi32_si128(*(const int *)src);
+ __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
+ __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
+ __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
+ // 00 10 01 11 02 12 03 13
+ const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
+ // 20 30 21 31 22 32 23 33
+ const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ A = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ B = _mm_srli_si128(A, 4);
+ C = _mm_srli_si128(A, 8);
+ D = _mm_srli_si128(A, 12);
+
+ *(int *)(dst) = _mm_cvtsi128_si32(A);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
+ *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
+ *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; y += 4) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 4) {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filtered values back to dst
+ transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+ }
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *filter) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+ const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
+ const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
+ const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
+ const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
+ const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
+ const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
+ const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
+ const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
+ const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+ const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+ const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+ const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+ const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+ const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+ const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+ // add and saturate the results together
+ const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+ const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, min_x2x1);
+ temp = _mm_adds_epi16(temp, max_x2x1);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_mulhrs_epi16(temp, k_256);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *filter) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+ const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+ const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+ const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+ const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+ const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+ const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+ const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+ const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+ const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+ // add and saturate the results together
+ const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+ const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, min_x2x1);
+ temp = _mm_adds_epi16(temp, max_x2x1);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_mulhrs_epi16(temp, k_256);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *filter, int w) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+ int i;
+
+ for (i = 0; i < w; i += 16) {
+ const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
+ const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ const __m128i C =
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ const __m128i D =
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ const __m128i E =
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ const __m128i F =
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ const __m128i G =
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+ const __m128i H =
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+ // merge the result together
+ const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
+ const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
+ const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
+ const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
+ const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
+ const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
+ const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
+ // add and saturate the results together
+ const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
+ const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
+ // merge the result together
+ const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
+ const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
+ const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
+ // merge the result together
+ const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
+ const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
+ const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
+ // add and saturate the results together
+ __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
+ __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
+
+ // add and saturate the results together
+ temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
+ temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
+ // round and shift by 7 bit each 16 bit
+ temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
+ temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+ src_ptr += 16;
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i *)&dst[i], temp_hi);
+ }
+}
+
+static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters, int x0_q4,
+ int x_step_q4, const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+ DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
+ x_step_q4, w, intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
+ x_step_q4, w, intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
+ y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
+ y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
+ y_step_q4, w, h);
+ }
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
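+  // Each filter table holds 16 sub-pixel phases of 8 int16_t taps, i.e.
+  // 16 * 8 * 2 = 256 bytes, so clearing the low 8 address bits of any kernel
+  // pointer recovers the start of its table.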
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
+void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
+}
+
+// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_, ssse3);
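+// (FUN_CONV_1D/FUN_CONV_2D come from aom_dsp/x86/convolve.h, included above;
+// FUN_CONV_2D builds the 2-D convolution by running the horizontal 1-D pass
+// into an intermediate buffer and then the vertical 1-D pass over it.)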
+#if CONFIG_LOOP_RESTORATION
+FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
+#endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..b946010d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -0,0 +1,990 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
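+;(paddsw saturates at +/-32767; the centre taps carry the largest weights, so
+;summing them last keeps the intermediate totals from clipping before all of
+;the smaller, partly negative outer taps have been accumulated.)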
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
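+;GET_FILTERS_4 splats each 16-bit tap across one 64-bit half and packs them
+;pairwise into k0k1/k2k3/k5k4/k6k7 on the stack, along with the rounding
+;constant krd (64 in every 16-bit lane) and an all-zero register.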
+
+%macro APPLY_FILTER_4 1
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
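+;APPLY_FILTER_4's single argument selects averaging: when it is non-zero the
+;4 packed output bytes are averaged (pavgb) with the bytes already at the
+;destination, which is how the *_avg_sse2 variants below reuse the macro.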
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
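+;APPLY_FILTER_8 works the same way for 8-pixel rows: the first argument
+;selects averaging with the destination and the second is the byte offset
+;within the row (the 16-wide kernels invoke it twice, at offsets 0 and 8).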
+
+;void aom_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d4_v8_sse2) PRIVATE
+sym(aom_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d8_v8_sse2) PRIVATE
+sym(aom_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d16_v8_sse2) PRIVATE
+sym(aom_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(aom_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(aom_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(aom_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 1, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d4_h8_sse2) PRIVATE
+sym(aom_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d8_h8_sse2) PRIVATE
+sym(aom_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d16_h8_sse2) PRIVATE
+sym(aom_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(aom_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(aom_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(aom_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
new file mode 100644
index 000000000..357f37401
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -0,0 +1,883 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to prevent overflow.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+ ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
+  ; pmaddubsw has a higher latency on some platforms; this might be eased by
+ ; interleaving the instructions.
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ packsswb m4, m4
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+ ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1
+ pshuflw m1, m4, 01010101b ;k2_k3
+ pshuflw m2, m4, 10101010b ;k4_k5
+ pshuflw m3, m4, 11111111b ;k6_k7
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ mova k0k1, m0
+ mova k2k3, m1
+ mova k4k5, m2
+ mova k6k7, m3
+%if ARCH_X86_64
+ %define krd m12
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
+ mova krd, [GLOBAL(pw_64)]
+%else
+ %define krd [rsp + 16*4]
+%if CONFIG_PIC=0
+ mova m6, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m6, m6 ;all ones
+ psrlw m6, 15
+ psllw m6, 6 ;aka pw_64
+%endif
+ mova krd, m6
+%endif
+%endm
+
+;-------------------------------------------------------------------------------
+%if ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
+%macro SUBPIX_HFILTER4 1
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ packsswb m4, m4
+%if ARCH_X86_64
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ mova krd, [GLOBAL(pw_64)]
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ pshuflw m6, m4, 0b ;k0_k1
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
+ pshuflw m7, m4, 01010101b ;k2_k3
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+ mova m1, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m1, m1 ;all ones
+ psrlw m1, 15
+ psllw m1, 6 ;aka pw_64
+%endif
+ mova k0k1k4k5, m6
+ mova k2k3k6k7, m7
+ mova krd, m1
+%endif
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m4, [srcq - 3]
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
+ pmaddubsw m1, k2k3k6k7
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
+ pmaddubsw m3, k2k3k6k7
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+    punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+ psrldq m1, m0, 4
+
+%ifidn %1, h8_avg
+ pavgb m0, m4
+ pavgb m1, m5
+%endif
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ pavgb m0, m4
+%endif
+ movd [dstq], m0
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m0, [srcq - 3]
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0
+ punpcklbw m0, m0
+ palignr m5, m1, m0, 13
+ pmaddubsw m5, k6k7
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
+ palignr m1, m0, 1
+ pmaddubsw m1, k0k1
+ punpckhbw m6, m4, m4
+ punpcklbw m4, m4
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
+ pmaddubsw m7, k6k7
+ paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
+ pmaddubsw m6, k0k1
+ paddsw m1, krd
+ pmaddubsw m5, k4k5
+ psraw m1, 7
+ paddsw m0, m7
+ paddsw m6, m5
+ paddsw m6, m0
+ paddsw m6, krd
+ psraw m6, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ paddsw m1, m4
+ paddsw m6, m5
+%endif
+ packuswb m1, m6
+%ifidn %1, h8_avg
+ pavgb m1, m2
+%endif
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, h8_add_src
+ pxor m6, m6
+ movu m5, [srcq]
+ punpcklbw m5, m6
+ paddsw m1, m5
+%endif
+ packuswb m1, m1
+%ifidn %1, h8_avg
+ movh m0, [dstq]
+ pavgb m1, m0
+%endif
+ movh [dstq], m1
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+.loop:
+ prefetcht0 [srcq + 2 * sstrideq -3]
+
+ movu m0, [srcq - 3]
+ movu m4, [srcq - 2]
+ pmaddubsw m0, k0k1
+ pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
+ pmaddubsw m5, k2k3
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
+ pmaddubsw m6, k4k5
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ paddsw m4, m5
+ paddsw m0, krd
+ paddsw m4, krd
+ psraw m0, 7
+ psraw m4, 7
+%ifidn %1, h8_add_src
+ movu m5, [srcq]
+ mova m7, m5
+ pand m5, [even_byte_mask]
+ psrlw m7, 8
+ paddsw m0, m5
+ paddsw m4, m7
+%endif
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4
+%ifidn %1, h8_avg
+ pavgb m0, [dstq]
+%endif
+ lea srcq, [srcq + sstrideq]
+ mova [dstq], m0
+ lea dstq, [dstq + dstrideq]
+ dec heightd
+ jnz .loop
+ REP_RET
+%endm
+
+INIT_XMM ssse3
+SUBPIX_HFILTER16 h8
+SUBPIX_HFILTER16 h8_avg
+SUBPIX_HFILTER8 h8
+SUBPIX_HFILTER8 h8_avg
+SUBPIX_HFILTER4 h8
+SUBPIX_HFILTER4 h8_avg
+
+%if CONFIG_LOOP_RESTORATION
+SUBPIX_HFILTER16 h8_add_src
+SUBPIX_HFILTER8 h8_add_src
+SUBPIX_HFILTER4 h8_add_src
+%endif
+
+;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
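+; When this is set, the code path tuned for older (Celeron-class) CPUs below is
+; used even on x86-64 builds.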
+
+%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
+%macro SUBPIX_VFILTER 2
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%ifidn %2, 8
+ %define movx movh
+%else
+ %define movx movd
+%endif
+
+ dec heightd
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, v8_add_src
+ movu m4, [src1q]
+ punpcklbw m4, m6
+ paddsw m1, m4
+%endif
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
+
+%ifidn %1, v8_avg
+ movx m2, [dstq]
+ pavgb m0, m2
+%endif
+ movx [dstq], m0
+ add dstq, dst_stride
+%ifidn %1, v8_avg
+ movx m3, [dstq]
+ pavgb m1, m3
+%endif
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%else
+ ; ARCH_X86_64
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; ARCH_X86_64
+
+.done:
+ REP_RET
+
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ lea src1q, [srcq + sstrideq]
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
+
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
+ mova m5, m4
+ punpcklbw m4, m6
+ punpckhbw m5, m6
+ paddsw m0, m4
+ paddsw m3, m5
+%endif
+ packuswb m0, m3
+
+ add srcq, sstrideq
+ add src1q, sstrideq
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; ARCH_X86_64
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; ARCH_X86_64
+
+%endm
+
+INIT_XMM ssse3
+SUBPIX_VFILTER16 v8
+SUBPIX_VFILTER16 v8_avg
+SUBPIX_VFILTER v8, 8
+SUBPIX_VFILTER v8_avg, 8
+SUBPIX_VFILTER v8, 4
+SUBPIX_VFILTER v8_avg, 4
+
+%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \
+ CONFIG_LOOP_RESTORATION
+SUBPIX_VFILTER16 v8_add_src
+SUBPIX_VFILTER v8_add_src, 8
+SUBPIX_VFILTER v8_add_src, 4
+%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
new file mode 100644
index 000000000..8f025a8be
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -0,0 +1,451 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
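+    ; 0x0040 = 64 in each 16-bit lane; adding this and then shifting right by 7
+    ; (see APPLY_FILTER_*) rounds the filtered value to the nearest integer.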
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+global sym(aom_filter_block1d4_v2_sse2) PRIVATE
+sym(aom_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_v2_sse2) PRIVATE
+sym(aom_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_v2_sse2) PRIVATE
+sym(aom_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(aom_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(aom_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(aom_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_h2_sse2) PRIVATE
+sym(aom_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_h2_sse2) PRIVATE
+sym(aom_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_h2_sse2) PRIVATE
+sym(aom_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(aom_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(aom_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(aom_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
new file mode 100644
index 000000000..b9b2da0be
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,421 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm3, [rdx] ;load filters
+ psrldq xmm3, 6
+ packsswb xmm3, xmm3
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movd xmm2, ecx ;rounding_shift
+ pshufd xmm2, xmm2, 0
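+    ; 0x0100 = 256 in each 16-bit lane: pmulhrsw(x, 256) = (x * 256 + 0x4000) >> 15,
+    ; which equals (x + 64) >> 7, i.e. the round-and-shift used by APPLY_FILTER_*.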
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm3
+
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm7, [rdx] ;load filters
+ psrldq xmm7, 6
+ packsswb xmm7, xmm7
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7
+
+ movd xmm6, ecx ;rounding_shift
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
+sym(aom_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
+sym(aom_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
+sym(aom_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(aom_filter_block1d4_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(aom_filter_block1d8_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(aom_filter_block1d16_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
+sym(aom_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
+sym(aom_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
+sym(aom_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(aom_filter_block1d4_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(aom_filter_block1d8_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(aom_filter_block1d16_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
new file mode 100644
index 000000000..bcdc20f63
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
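+  // avg holds the sum of the 64 pixels; round to nearest and divide by 64.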
+ return (avg + 32) >> 6;
+}
+
+unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0);
+ s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 2 * p), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 3 * p), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 8) >> 4;
+}
+
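+// One 1-D pass of the 8x8 Hadamard transform over the eight vectors in in[].
+// When iter == 0 the result is also transposed so that the second call
+// operates along the other dimension.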
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ _mm_store_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[7]);
+}
+
+void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ int16_t const *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ }
+
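+  // Combine the four 8x8 transforms with one more butterfly stage; the
+  // right-shift by 1 keeps the 16x16 coefficients within 16-bit range.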
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ _mm_store_si128((__m128i *)coeff, coeff0);
+ _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ }
+}
+
+int aom_satd_sse2(const int16_t *coeff, int length) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum = zero;
+
+ for (i = 0; i < length; i += 8) {
+ const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ const __m128i inv = _mm_sub_epi16(zero, src_line);
+ const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line)
+ const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+ const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+ const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+ accum = _mm_add_epi32(accum, sum);
+ coeff += 8;
+ }
+
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);
+}
+
+void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride,
+ int height) {
+ int idx;
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+ __m128i t0, t1;
+ int height_1 = height - 1;
+ ref += ref_stride;
+
+ for (idx = 1; idx < height_1; idx += 2) {
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+ }
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+
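+  // Normalize the column sums: >> 5, 4 or 3 corresponds to dividing by half of
+  // the block height (64, 32 or 16 rows).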
+ if (height == 64) {
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+ } else if (height == 32) {
+ s0 = _mm_srai_epi16(s0, 4);
+ s1 = _mm_srai_epi16(s1, 4);
+ } else {
+ s0 = _mm_srai_epi16(s0, 3);
+ s1 = _mm_srai_epi16(s1, 3);
+ }
+
+ _mm_storeu_si128((__m128i *)hbuf, s0);
+ hbuf += 8;
+ _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_load_si128((const __m128i *)ref);
+ __m128i s0 = _mm_sad_epu8(src_line, zero);
+ __m128i s1;
+ int i;
+
+ for (i = 16; i < width; i += 16) {
+ ref += 16;
+ src_line = _mm_load_si128((const __m128i *)ref);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ return _mm_extract_epi16(s0, 0);
+}
+
+int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) {
+ int idx;
+ int width = 4 << bwl;
+ int16_t mean;
+ __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i v1 = _mm_load_si128((const __m128i *)src);
+ __m128i diff = _mm_subs_epi16(v0, v1);
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
+
+ ref += 8;
+ src += 8;
+
+ for (idx = 8; idx < width; idx += 8) {
+ v0 = _mm_loadu_si128((const __m128i *)ref);
+ v1 = _mm_load_si128((const __m128i *)src);
+ diff = _mm_subs_epi16(v0, v1);
+
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
+
+ ref += 8;
+ src += 8;
+ }
+
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
+
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = _mm_extract_epi16(sum, 0);
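+  // mean is really the horizontal sum of the differences; the result is
+  // sse - sum * sum / width, where width = 4 << bwl, hence the >> (bwl + 2).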
+
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
new file mode 100644
index 000000000..b2d150296
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
@@ -0,0 +1,124 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix aom
+
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides an SSSE3 version of the Hadamard transform. Some of the
+; macro definitions were originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION .text
+
+%if ARCH_X86_64
+; matrix transpose
+%macro INTERLEAVE_2X 4
+ punpckh%1 m%4, m%2, m%3
+ punpckl%1 m%2, m%3
+ SWAP %3, %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+ INTERLEAVE_2X wd, %1, %2, %9
+ INTERLEAVE_2X wd, %3, %4, %9
+ INTERLEAVE_2X wd, %5, %6, %9
+ INTERLEAVE_2X wd, %7, %8, %9
+
+ INTERLEAVE_2X dq, %1, %3, %9
+ INTERLEAVE_2X dq, %2, %4, %9
+ INTERLEAVE_2X dq, %5, %7, %9
+ INTERLEAVE_2X dq, %6, %8, %9
+
+ INTERLEAVE_2X qdq, %1, %5, %9
+ INTERLEAVE_2X qdq, %3, %7, %9
+ INTERLEAVE_2X qdq, %2, %6, %9
+ INTERLEAVE_2X qdq, %4, %8, %9
+
+ SWAP %2, %5
+ SWAP %4, %7
+%endmacro
+
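+; Full 1-D 8-point Hadamard on the eight rows held in m0-m7 (m8/m9 as scratch).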
+%macro HMD8_1D 0
+ psubw m8, m0, m1
+ psubw m9, m2, m3
+ paddw m0, m1
+ paddw m2, m3
+ SWAP 1, 8
+ SWAP 3, 9
+ psubw m8, m4, m5
+ psubw m9, m6, m7
+ paddw m4, m5
+ paddw m6, m7
+ SWAP 5, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m2
+ psubw m9, m1, m3
+ paddw m0, m2
+ paddw m1, m3
+ SWAP 2, 8
+ SWAP 3, 9
+ psubw m8, m4, m6
+ psubw m9, m5, m7
+ paddw m4, m6
+ paddw m5, m7
+ SWAP 6, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m4
+ psubw m9, m1, m5
+ paddw m0, m4
+ paddw m1, m5
+ SWAP 4, 8
+ SWAP 5, 9
+ psubw m8, m2, m6
+ psubw m9, m3, m7
+ paddw m2, m6
+ paddw m3, m7
+ SWAP 6, 8
+ SWAP 7, 9
+%endmacro
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ HMD8_1D
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+ HMD8_1D
+
+ mova [outputq + 0], m0
+ mova [outputq + 16], m1
+ mova [outputq + 32], m2
+ mova [outputq + 48], m3
+ mova [outputq + 64], m4
+ mova [outputq + 80], m5
+ mova [outputq + 96], m6
+ mova [outputq + 112], m7
+
+ RET
+%endif
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 000000000..e916e4ff9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+
+#include "./aom_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, 0, h, w, 0, 0);
+}
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_blend_a64_hmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
+ src1_8, src1_stride, mask, 0, h, w, 0, 0,
+ bd);
+}
+#endif // CONFIG_HIGHBITDEPTH
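The wrappers above reuse the 2-D mask kernel by passing a mask stride of 0, so
every row reads the same one-line horizontal mask. The underlying per-pixel
operation is the 6-bit alpha blend defined by AOM_BLEND_A64 in
aom_dsp/blend.h; a minimal scalar sketch follows (blend_px_sketch is an
illustrative name, not an aom function).

#include <stdint.h>

// Scalar sketch of the per-pixel A64 blend: alpha m is in [0, 64]
// (AOM_BLEND_A64_MAX_ALPHA) and the weighted sum is rounded back down by
// AOM_BLEND_A64_ROUND_BITS (6), i.e. (m*v0 + (64-m)*v1 + 32) >> 6.
static inline uint8_t blend_px_sketch(uint8_t v0, uint8_t v1, int m) {
  return (uint8_t)((m * v0 + (64 - m) * v1 + 32) >> 6);
}

The SIMD kernels compute the same weighted sum on 4, 8, or 16 pixels at a
time, with the complementary weight v_m1_w obtained as 64 minus the mask.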
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
new file mode 100644
index 000000000..68d74e517
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -0,0 +1,924 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+
+#include "./aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_m0l_b = xx_loadl_64(mask + c);
+ const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
+ const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
+ const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
+ const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
+
+ const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
+ const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_zero = _mm_setzero_si128();
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ra_b = xx_loadu_128(mask + c);
+ const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+ const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+ const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+ const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+ const __m128i v_rvsbl_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
+ const __m128i v_rvsbh_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
+ const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ int w, int suby, int subx) {
+ typedef void (*blend_fn)(
+ uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w);
+
+ // Dimensions are: width_index X subx X suby
+ static const blend_fn blend[3][2][2] = {
+ { // w % 16 == 0
+ { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
+ { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
+ { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
+ { // w == 8
+ { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
+ { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, h, w, suby, subx);
+ } else {
+ blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
+ src0_stride, src1, src1_stride,
+ mask, mask_stride, h, w);
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b10);
+}
+
+static void blend_a64_mask_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_m0_b = xx_loadl_64(mask + c);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadl_64(mask + c);
+ const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int h, int w,
+ int suby, int subx, int bd) {
+ typedef void (*blend_fn)(
+ uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w);
+
+ // Dimensions are: bd_index X width_index X subx X suby
+ static const blend_fn blend[2][2][2][2] = {
+ { // bd == 8 or 10
+ { // w % 8 == 0
+ { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
+ { blend_a64_mask_b10_sx_w8n_sse4_1,
+ blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
+ { blend_a64_mask_b10_sx_w4_sse4_1,
+ blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
+ { // bd == 12
+ { // w % 8 == 0
+ { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
+ { blend_a64_mask_b12_sx_w8n_sse4_1,
+ blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
+ { blend_a64_mask_b12_sx_w4_sse4_1,
+ blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, mask_stride, h, w, suby,
+ subx, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
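The low-bit-depth dispatcher above selects its kernel with (w >> 2) & 3.
Because w is a power of two, and widths of 1 or 2 have already been routed to
the C fallback by the (h | w) & 3 test, this maps w == 4 to index 1, w == 8 to
index 2, and every multiple of 16 to index 0, matching the comments in the
table; the high-bit-depth dispatcher applies (w >> 2) & 1 the same way to its
two width classes. A small sketch of the mapping, written as assertions purely
for illustration:

#include <assert.h>

// Illustrative check of the width-index mapping used by the dispatch
// tables above (w is a power of two; w <= 2 never reaches this path).
static void check_width_index_sketch(void) {
  assert(((4 >> 2) & 3) == 1);   // w == 4      -> "w == 4" row
  assert(((8 >> 2) & 3) == 2);   // w == 8      -> "w == 8" row
  assert(((16 >> 2) & 3) == 0);  // w % 16 == 0 -> "w % 16 == 0" row
  assert(((32 >> 2) & 3) == 0);
  assert(((64 >> 2) & 3) == 0);
}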
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 000000000..9dabe5b79
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+
+#include "./aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0,
+ uint32_t src0_stride,
+ const uint8_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+
+ // Dimension: width_index
+ static const blend_fn blend[9] = {
+ blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
+ aom_blend_a64_vmask_c, // w == 1
+ aom_blend_a64_vmask_c, // w == 2
+ NULL, // INVALID
+ blend_a64_vmask_w4_sse4_1, // w == 4
+ NULL, // INVALID
+ NULL, // INVALID
+ NULL, // INVALID
+ blend_a64_vmask_w8_sse4_1, // w == 8
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h,
+ w);
+}
+
+#if CONFIG_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b10);
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, w, blend_8_b10);
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, w, blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_highbd_blend_a64_vmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+
+ // Dimensions are: bd_index X width_index
+ static const blend_fn blend[2][2] = {
+ {
+ // bd == 8 or 10
+ blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b10_w4_sse4_1, // w == 4
+ },
+ {
+ // bd == 12
+ blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b12_w4_sse4_1, // w == 4
+ }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, h, w, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, w);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
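The vertical-mask dispatcher above indexes its 9-entry table with w & 0xf:
with w restricted to powers of two, the only reachable entries are 0 (any
multiple of 16), 1 and 2 (the C fallbacks for very narrow blocks), 4, and 8,
so the NULL slots are never hit. A sketch of that reachability, again written
as assertions for illustration only:

#include <assert.h>

// Illustrative check of the w & 0xf indexing used by
// aom_blend_a64_vmask_sse4_1 (w is asserted to be a powerer of two >= 1).
static void check_vmask_index_sketch(void) {
  assert((1 & 0xf) == 1);    // w == 1  -> C fallback
  assert((2 & 0xf) == 2);    // w == 2  -> C fallback
  assert((4 & 0xf) == 4);    // w == 4  -> w4 kernel
  assert((8 & 0xf) == 8);    // w == 8  -> w8 kernel
  assert((16 & 0xf) == 0);   // w % 16 == 0 -> w16n kernel
  assert((32 & 0xf) == 0);
}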
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
new file mode 100644
index 000000000..daa2b2b3a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_DSP_X86_BLEND_SSE4_H_
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+#if CONFIG_HIGHBITDEPTH
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w);
+
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ // Interleave
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale
+ const __m128i v_ssum_d =
+ _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ // Interleave
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale
+ const __m128i v_ssuml_d =
+ _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
+ const __m128i v_ssumh_d =
+ _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#endif // AOM_DSP_X86_BLEND_SSE4_H_
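The 10-bit kernels above stay entirely in 16-bit lanes, while the 12-bit
kernels interleave (sample, mask) pairs and accumulate through
_mm_madd_epi16. The split comes from worst-case arithmetic: with
m0 + m1 == 64, the blended sum is at most max_sample * 64. A sketch of that
bound, assuming AOM_BLEND_A64_MAX_ALPHA is 64 (the function name is
illustrative, not library code):

#include <assert.h>
#include <stdint.h>

// Sketch of the overflow reasoning behind the b10 vs. b12 kernel split.
static void blend_hbd_overflow_sketch(void) {
  const uint32_t max10 = 1023 * 64;  // 10-bit sample times max alpha
  const uint32_t max12 = 4095 * 64;  // 12-bit sample times max alpha
  // Fits a 16-bit lane, so blend_{4,8}_b10 can use mullo/add directly.
  assert(max10 <= UINT16_MAX);
  // Overflows 16 bits, so blend_{4,8}_b12 go through _mm_madd_epi16
  // (32-bit sums), shift by AOM_BLEND_A64_ROUND_BITS - 1, pack, and
  // finish the last rounding bit in 16-bit lanes.
  assert(max12 > UINT16_MAX);
}

Splitting the right shift as (x >> 5) followed by a final round-by-one step
gives the same result as a single rounded (x + 32) >> 6, assuming
xx_round_epu16 performs that last half-up rounding.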
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
new file mode 100644
index 000000000..8641164db
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_DSP_X86_CONVOLVE_H_
+#define AOM_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_convolve.h"
+
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ (void)filter_x; \
+ (void)x_step_q4; \
+ (void)filter_y; \
+ (void)y_step_q4; \
+ assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+ assert(step_q4 == 16); \
+ if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ if (w) { \
+ aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+ }
+
+#define FUN_CONV_2D(avg, opt) \
+ void aom_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \
+ assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \
+ assert(w <= MAX_SB_SIZE); \
+ assert(h <= MAX_SB_SIZE); \
+ assert(x_step_q4 == 16); \
+ assert(y_step_q4 == 16); \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \
+ filter_y[1] || filter_y[2]) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
+ aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \
+ MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h + 7); \
+ aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
+ dst, dst_stride, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
+ aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, \
+ h + 1); \
+ aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } \
+ }
+
+#if CONFIG_LOOP_RESTORATION
+// convolve_add_src is only used by the Wiener filter, which will never
+// end up calling the bilinear functions (it uses a symmetric filter, so
+// the possible numbers of taps are 1,3,5,7)
+#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
+ opt) \
+ void aom_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ (void)filter_x; \
+ (void)x_step_q4; \
+ (void)filter_y; \
+ (void)y_step_q4; \
+ assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+ assert(step_q4 == 16); \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ if (w) { \
+ aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+ }
+
+#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \
+ void aom_convolve8_##type##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
+ assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \
+ assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \
+ assert(w <= MAX_SB_SIZE); \
+ assert(h <= MAX_SB_SIZE); \
+ assert(x_step_q4 == 16); \
+ assert(y_step_q4 == 16); \
+ aom_convolve8_##htype##horiz_##opt( \
+ src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h + 7); \
+ aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
+ dst, dst_stride, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h); \
+ }
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_highbd_convolve8_##name##_##opt( \
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ aom_highbd_convolve8_##name##_c( \
+ CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \
+ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } \
+ }
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+ void aom_highbd_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ assert(w <= MAX_SB_SIZE); \
+ assert(h <= MAX_SB_SIZE); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, \
+ fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
+ aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), \
+ MAX_SB_SIZE, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h + 7, bd); \
+ aom_highbd_convolve8_##avg##vert_##opt( \
+ CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \
+ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, \
+ fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
+ aom_highbd_convolve8_horiz_##opt( \
+ src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \
+ aom_highbd_convolve8_##avg##vert_##opt( \
+ CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } \
+ } else { \
+ aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+#endif // AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
new file mode 100644
index 000000000..b8ec08de7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
@@ -0,0 +1,862 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "aom_dsp/fwd_txfm.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+// Apply a 32-element forward DCT to 8 columns. This does not do any
+// transposition of its output - the caller is expected to do that.
+// The input buffers are the top and bottom halves (16 rows each) of a
+// 32-row by 8-column block of 16-bit coefficients.
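+// The transform is done in place: the 32 output rows overwrite in0 (rows
+// 0-15) and in1 (rows 16-31); see the "Output results" block at the end of
+// the function.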
+void fdct32_8col(__m128i *in0, __m128i *in1) {
+  // Constants
+  // In one case all the 16-bit lanes hold the same value. In all other cases
+  // we need a pair of values repeated four times, which is done by
+  // constructing the 32-bit constant corresponding to that pair.
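+  // For example, pair_set_epi16(a, b) is assumed to fill a register with the
+  // 16-bit lanes {a, b, a, b, a, b, a, b}, so that
+  //   _mm_madd_epi16(_mm_unpacklo_epi16(x, y), pair_set_epi16(a, b))
+  // yields the four 32-bit sums x[i] * a + y[i] * b, i.e. one rotation
+  // (butterfly) term per lane.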
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ {
+ const __m128i *ina = in0;
+ const __m128i *inb = in1 + 15;
+ __m128i *step1a = &step1[0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ {
+ const __m128i *ina = in0 + 4;
+ const __m128i *inb = in1 + 11;
+ __m128i *step1a = &step1[4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ {
+ const __m128i *ina = in0 + 8;
+ const __m128i *inb = in1 + 7;
+ __m128i *step1a = &step1[8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ {
+ const __m128i *ina = in0 + 12;
+ const __m128i *inb = in1 + 3;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ // Stage 2
+ {
+ step2[0] = _mm_add_epi16(step1[0], step1[15]);
+ step2[1] = _mm_add_epi16(step1[1], step1[14]);
+ step2[2] = _mm_add_epi16(step1[2], step1[13]);
+ step2[3] = _mm_add_epi16(step1[3], step1[12]);
+ step2[4] = _mm_add_epi16(step1[4], step1[11]);
+ step2[5] = _mm_add_epi16(step1[5], step1[10]);
+ step2[6] = _mm_add_epi16(step1[6], step1[9]);
+ step2[7] = _mm_add_epi16(step1[7], step1[8]);
+ step2[8] = _mm_sub_epi16(step1[7], step1[8]);
+ step2[9] = _mm_sub_epi16(step1[6], step1[9]);
+ step2[10] = _mm_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
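+    // (DCT_CONST_ROUNDING is assumed to be 1 << (DCT_CONST_BITS - 1), so the
+    // add and arithmetic shift below round each 32-bit product to nearest.)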
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+ }
+ // Stage 3
+ {
+ step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[0] = _mm_add_epi16(step3[3], step3[0]);
+ step1[1] = _mm_add_epi16(step3[2], step3[1]);
+ step1[2] = _mm_sub_epi16(step3[1], step3[2]);
+ step1[3] = _mm_sub_epi16(step3[0], step3[3]);
+ step1[8] = _mm_add_epi16(step3[11], step2[8]);
+ step1[9] = _mm_add_epi16(step3[10], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[9], step3[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step3[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[8] = _mm_add_epi16(step2[9], step1[8]);
+ step3[9] = _mm_sub_epi16(step1[8], step2[9]);
+ step3[10] = _mm_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm_add_epi16(step3[30], step2[31]);
+ }
+  // Final stage: output indices are bit-reversed.
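+  // (For example, the step1[16]/step1[31] pair below produces out[1] and
+  // out[31].)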
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+ }
+
+ // Output results
+ {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
+ _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
+ }
+ }
+} // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
new file mode 100644
index 000000000..216739581
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -0,0 +1,3022 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_intrin.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+#if FDCT32x32_HIGH_PRECISION
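+// 32-bit analogue of _mm256_madd_epi16: for each 64-bit lane, returns the sum
+// of the products of the two 32-bit elements of a and b in that lane,
+// computed with unsigned 32x32->64-bit multiplies.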
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+ __m256i buf0, buf1;
+ buf0 = _mm256_mul_epu32(a, b);
+ a = _mm256_srli_epi64(a, 32);
+ b = _mm256_srli_epi64(b, 32);
+ buf1 = _mm256_mul_epu32(a, b);
+ return _mm256_add_epi64(buf0, buf1);
+}
+
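+// Packs the low 32 bits of each 64-bit lane of a and b into a single vector
+// of 32-bit values: within each 128-bit lane, a's two values followed by b's.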
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+ __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm256_unpacklo_epi64(buf0, buf1);
+}
+#endif
+
+#ifndef STORE_COEFF_FUNC
+#define STORE_COEFF_FUNC
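+// Store the low and high 128-bit halves of *coeff to two separate coefficient
+// rows, curr and next, via storeu_output() (which is assumed to handle the
+// tran_low_t element size).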
+static void store_coeff(const __m256i *coeff, tran_low_t *curr,
+ tran_low_t *next) {
+ __m128i u = _mm256_castsi256_si128(*coeff);
+ storeu_output(&u, curr);
+ u = _mm256_extractf128_si256(*coeff, 1);
+ storeu_output(&u, next);
+}
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
+ int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
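+  // (The first pass stores its results here so the second pass can read them
+  // back as transposed rows - see the pass loop below.)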
+  // Constants
+  //    In one case all the 16-bit lanes hold the same value. In all other
+  //    cases we need a pair of values repeated eight times (these registers
+  //    are 256 bits wide), which is done by constructing the 32-bit constant
+  //    corresponding to that pair.
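+  // pair256_set_epi16(a, b) is assumed to repeat the 16-bit pair {a, b}
+  // across the whole register, so _mm256_madd_epi16 on interleaved inputs
+  // yields x[i] * a + y[i] * b in each of the eight 32-bit lanes.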
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i k__cospi_p16_m16 =
+ pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i k__cospi_m12_m20 =
+ pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ const __m256i kZero = _mm256_set1_epi16(0);
+ const __m256i kOne = _mm256_set1_epi16(1);
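+ // Note on the idiom used throughout: pairs of rows are interleaved with
+ // _mm256_unpacklo/unpackhi_epi16 and multiplied by a pair constant with
+ // _mm256_madd_epi16, so every 32-bit lane holds a * c0 + b * c1. Adding
+ // k__DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS then applies
+ // the dct_const_round_shift rounding, and _mm256_packs_epi32 narrows the
+ // results back to 16 bits with saturation.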
+ // Do the two transform/transpose passes
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process sixteen columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 16) {
+ __m256i step1[32];
+ __m256i step2[32];
+ __m256i step3[32];
+ __m256i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
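+ // Each block below is a butterfly: sums of mirrored rows go to
+ // step1[0..15] through step1a, differences go to step1[16..31] through
+ // the negative offsets on step1b, and the first pass pre-scales
+ // everything by << 2.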
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m256i *step1a = &step1[0];
+ __m256i *step1b = &step1[31];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m256i *step1a = &step1[4];
+ __m256i *step1b = &step1[27];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m256i *step1a = &step1[8];
+ __m256i *step1b = &step1[23];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m256i *step1a = &step1[12];
+ __m256i *step1b = &step1[19];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same approach as above to have a common offset is
+ // counter-productive, as all the offsets can be calculated at compile
+ // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
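+ // In the second pass the input is the transposed intermediate buffer,
+ // so consecutive rows are 32 elements apart and no extra scaling is
+ // applied.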
+ {
+ __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
+ __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
+ __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
+ __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
+ __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+ __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+ __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+ __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+ step1[0] = _mm256_add_epi16(in00, in31);
+ step1[1] = _mm256_add_epi16(in01, in30);
+ step1[2] = _mm256_add_epi16(in02, in29);
+ step1[3] = _mm256_add_epi16(in03, in28);
+ step1[28] = _mm256_sub_epi16(in03, in28);
+ step1[29] = _mm256_sub_epi16(in02, in29);
+ step1[30] = _mm256_sub_epi16(in01, in30);
+ step1[31] = _mm256_sub_epi16(in00, in31);
+ }
+ {
+ __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
+ __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
+ __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
+ __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
+ __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+ __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+ __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+ __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+ step1[4] = _mm256_add_epi16(in04, in27);
+ step1[5] = _mm256_add_epi16(in05, in26);
+ step1[6] = _mm256_add_epi16(in06, in25);
+ step1[7] = _mm256_add_epi16(in07, in24);
+ step1[24] = _mm256_sub_epi16(in07, in24);
+ step1[25] = _mm256_sub_epi16(in06, in25);
+ step1[26] = _mm256_sub_epi16(in05, in26);
+ step1[27] = _mm256_sub_epi16(in04, in27);
+ }
+ {
+ __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
+ __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
+ __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+ __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+ __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+ __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+ __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+ __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+ step1[8] = _mm256_add_epi16(in08, in23);
+ step1[9] = _mm256_add_epi16(in09, in22);
+ step1[10] = _mm256_add_epi16(in10, in21);
+ step1[11] = _mm256_add_epi16(in11, in20);
+ step1[20] = _mm256_sub_epi16(in11, in20);
+ step1[21] = _mm256_sub_epi16(in10, in21);
+ step1[22] = _mm256_sub_epi16(in09, in22);
+ step1[23] = _mm256_sub_epi16(in08, in23);
+ }
+ {
+ __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+ __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+ __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+ __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+ __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+ __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+ __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+ __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+ step1[12] = _mm256_add_epi16(in12, in19);
+ step1[13] = _mm256_add_epi16(in13, in18);
+ step1[14] = _mm256_add_epi16(in14, in17);
+ step1[15] = _mm256_add_epi16(in15, in16);
+ step1[16] = _mm256_sub_epi16(in15, in16);
+ step1[17] = _mm256_sub_epi16(in14, in17);
+ step1[18] = _mm256_sub_epi16(in13, in18);
+ step1[19] = _mm256_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[0] = _mm256_add_epi16(step1[0], step1[15]);
+ step2[1] = _mm256_add_epi16(step1[1], step1[14]);
+ step2[2] = _mm256_add_epi16(step1[2], step1[13]);
+ step2[3] = _mm256_add_epi16(step1[3], step1[12]);
+ step2[4] = _mm256_add_epi16(step1[4], step1[11]);
+ step2[5] = _mm256_add_epi16(step1[5], step1[10]);
+ step2[6] = _mm256_add_epi16(step1[6], step1[9]);
+ step2[7] = _mm256_add_epi16(step1[7], step1[8]);
+ step2[8] = _mm256_sub_epi16(step1[7], step1[8]);
+ step2[9] = _mm256_sub_epi16(step1[6], step1[9]);
+ step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+ }
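+ // step2[20..27] are built from step1[20..27] with cospi_16_64: each
+ // interleaved pair (step1[27 - k], step1[20 + k]) yields a scaled
+ // difference (step2[20 + k]) and a scaled sum (step2[27 - k]).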
+ {
+ const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+ const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+ const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+ const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+ const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+ const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+ const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+ const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+ const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s2_20_4 =
+ _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_5 =
+ _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_4 =
+ _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_5 =
+ _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_4 =
+ _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_5 =
+ _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_4 =
+ _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_5 =
+ _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_4 =
+ _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_5 =
+ _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_4 =
+ _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_5 =
+ _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_4 =
+ _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_5 =
+ _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_4 =
+ _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_5 =
+ _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+ // Reduce the magnitude by half so that the intermediate values stay
+ // within the range of 16 bits.
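+ // The three steps below compute (x + 1 + (x < 0)) >> 2 per element: the
+ // comparison gives -1 for negative values, subtracting it adds 1, and
+ // kOne plus the arithmetic shift complete the rounded divide by 4.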
+ if (1 == pass) {
+ __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]);
+ __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]);
+ __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]);
+ __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]);
+ __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]);
+ __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]);
+ __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]);
+ __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]);
+ __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]);
+ __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]);
+ __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
+ __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
+ __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
+ __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
+ __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
+ __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
+ __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
+ __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
+ __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
+ __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
+ __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
+ __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
+ __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
+ __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
+ __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
+ __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
+ __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
+ __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
+ __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
+ __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
+ __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
+ __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
+
+ step2[0] = _mm256_sub_epi16(step2[0], s3_00_0);
+ step2[1] = _mm256_sub_epi16(step2[1], s3_01_0);
+ step2[2] = _mm256_sub_epi16(step2[2], s3_02_0);
+ step2[3] = _mm256_sub_epi16(step2[3], s3_03_0);
+ step2[4] = _mm256_sub_epi16(step2[4], s3_04_0);
+ step2[5] = _mm256_sub_epi16(step2[5], s3_05_0);
+ step2[6] = _mm256_sub_epi16(step2[6], s3_06_0);
+ step2[7] = _mm256_sub_epi16(step2[7], s3_07_0);
+ step2[8] = _mm256_sub_epi16(step2[8], s2_08_0);
+ step2[9] = _mm256_sub_epi16(step2[9], s2_09_0);
+ step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+ step2[0] = _mm256_add_epi16(step2[0], kOne);
+ step2[1] = _mm256_add_epi16(step2[1], kOne);
+ step2[2] = _mm256_add_epi16(step2[2], kOne);
+ step2[3] = _mm256_add_epi16(step2[3], kOne);
+ step2[4] = _mm256_add_epi16(step2[4], kOne);
+ step2[5] = _mm256_add_epi16(step2[5], kOne);
+ step2[6] = _mm256_add_epi16(step2[6], kOne);
+ step2[7] = _mm256_add_epi16(step2[7], kOne);
+ step2[8] = _mm256_add_epi16(step2[8], kOne);
+ step2[9] = _mm256_add_epi16(step2[9], kOne);
+ step2[10] = _mm256_add_epi16(step2[10], kOne);
+ step2[11] = _mm256_add_epi16(step2[11], kOne);
+ step2[12] = _mm256_add_epi16(step2[12], kOne);
+ step2[13] = _mm256_add_epi16(step2[13], kOne);
+ step2[14] = _mm256_add_epi16(step2[14], kOne);
+ step2[15] = _mm256_add_epi16(step2[15], kOne);
+ step1[16] = _mm256_add_epi16(step1[16], kOne);
+ step1[17] = _mm256_add_epi16(step1[17], kOne);
+ step1[18] = _mm256_add_epi16(step1[18], kOne);
+ step1[19] = _mm256_add_epi16(step1[19], kOne);
+ step2[20] = _mm256_add_epi16(step2[20], kOne);
+ step2[21] = _mm256_add_epi16(step2[21], kOne);
+ step2[22] = _mm256_add_epi16(step2[22], kOne);
+ step2[23] = _mm256_add_epi16(step2[23], kOne);
+ step2[24] = _mm256_add_epi16(step2[24], kOne);
+ step2[25] = _mm256_add_epi16(step2[25], kOne);
+ step2[26] = _mm256_add_epi16(step2[26], kOne);
+ step2[27] = _mm256_add_epi16(step2[27], kOne);
+ step1[28] = _mm256_add_epi16(step1[28], kOne);
+ step1[29] = _mm256_add_epi16(step1[29], kOne);
+ step1[30] = _mm256_add_epi16(step1[30], kOne);
+ step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+ step2[0] = _mm256_srai_epi16(step2[0], 2);
+ step2[1] = _mm256_srai_epi16(step2[1], 2);
+ step2[2] = _mm256_srai_epi16(step2[2], 2);
+ step2[3] = _mm256_srai_epi16(step2[3], 2);
+ step2[4] = _mm256_srai_epi16(step2[4], 2);
+ step2[5] = _mm256_srai_epi16(step2[5], 2);
+ step2[6] = _mm256_srai_epi16(step2[6], 2);
+ step2[7] = _mm256_srai_epi16(step2[7], 2);
+ step2[8] = _mm256_srai_epi16(step2[8], 2);
+ step2[9] = _mm256_srai_epi16(step2[9], 2);
+ step2[10] = _mm256_srai_epi16(step2[10], 2);
+ step2[11] = _mm256_srai_epi16(step2[11], 2);
+ step2[12] = _mm256_srai_epi16(step2[12], 2);
+ step2[13] = _mm256_srai_epi16(step2[13], 2);
+ step2[14] = _mm256_srai_epi16(step2[14], 2);
+ step2[15] = _mm256_srai_epi16(step2[15], 2);
+ step1[16] = _mm256_srai_epi16(step1[16], 2);
+ step1[17] = _mm256_srai_epi16(step1[17], 2);
+ step1[18] = _mm256_srai_epi16(step1[18], 2);
+ step1[19] = _mm256_srai_epi16(step1[19], 2);
+ step2[20] = _mm256_srai_epi16(step2[20], 2);
+ step2[21] = _mm256_srai_epi16(step2[21], 2);
+ step2[22] = _mm256_srai_epi16(step2[22], 2);
+ step2[23] = _mm256_srai_epi16(step2[23], 2);
+ step2[24] = _mm256_srai_epi16(step2[24], 2);
+ step2[25] = _mm256_srai_epi16(step2[25], 2);
+ step2[26] = _mm256_srai_epi16(step2[26], 2);
+ step2[27] = _mm256_srai_epi16(step2[27], 2);
+ step1[28] = _mm256_srai_epi16(step1[28], 2);
+ step1[29] = _mm256_srai_epi16(step1[29], 2);
+ step1[30] = _mm256_srai_epi16(step1[30], 2);
+ step1[31] = _mm256_srai_epi16(step1[31], 2);
+ }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
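+ // With FDCT32x32_HIGH_PRECISION the 16-bit stages below only run for
+ // the first pass (pass == 0); the second pass presumably switches to a
+ // widened path elsewhere in this file.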
+ // Stage 3
+ {
+ step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[0] = _mm256_add_epi16(step3[3], step3[0]);
+ step1[1] = _mm256_add_epi16(step3[2], step3[1]);
+ step1[2] = _mm256_sub_epi16(step3[1], step3[2]);
+ step1[3] = _mm256_sub_epi16(step3[0], step3[3]);
+ step1[8] = _mm256_add_epi16(step3[11], step2[8]);
+ step1[9] = _mm256_add_epi16(step3[10], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[9], step3[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step3[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+ const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+ const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s1_05_4 =
+ _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_5 =
+ _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_4 =
+ _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_5 =
+ _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+ const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+ const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+ const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+ const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+ const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+ const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+ const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+ const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s1_18_4 =
+ _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_5 =
+ _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_4 =
+ _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_5 =
+ _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_4 =
+ _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_5 =
+ _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_4 =
+ _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_5 =
+ _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_4 =
+ _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_5 =
+ _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_4 =
+ _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_5 =
+ _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_4 =
+ _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_5 =
+ _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_4 =
+ _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_5 =
+ _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+ }
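+ // The block below combines step1[0..3] into transform outputs 0, 16, 8
+ // and 24.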
+ {
+ const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+ const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+ const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+ const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+ const __m256i out_00_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m256i out_00_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m256i out_16_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m256i out_16_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m256i out_08_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m256i out_08_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m256i out_24_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m256i out_24_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m256i out_00_4 =
+ _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_5 =
+ _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_4 =
+ _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_5 =
+ _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_4 =
+ _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_5 =
+ _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_4 =
+ _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_5 =
+ _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm256_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm256_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]);
+ const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]);
+ const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+ const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+ const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s2_09_4 =
+ _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_5 =
+ _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_4 =
+ _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_5 =
+ _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_4 =
+ _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_5 =
+ _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_4 =
+ _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_5 =
+ _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_04_2 =
+ _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m256i out_04_3 =
+ _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m256i out_20_2 =
+ _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m256i out_20_3 =
+ _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m256i out_12_2 =
+ _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m256i out_12_3 =
+ _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m256i out_28_2 =
+ _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m256i out_28_3 =
+ _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m256i out_04_4 =
+ _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_5 =
+ _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_4 =
+ _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_5 =
+ _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_4 =
+ _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_5 =
+ _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_4 =
+ _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_5 =
+ _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm256_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[8] = _mm256_add_epi16(step2[9], step1[8]);
+ step3[9] = _mm256_sub_epi16(step1[8], step2[9]);
+ step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+ const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+ const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+ const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+ const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+ const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+ const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+ const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+ const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m256i s3_17_4 =
+ _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_5 =
+ _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_4 =
+ _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_5 =
+ _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_4 =
+ _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_5 =
+ _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_4 =
+ _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_5 =
+ _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m256i s3_25_4 =
+ _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_5 =
+ _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_4 =
+ _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_5 =
+ _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_4 =
+ _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_5 =
+ _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_4 =
+ _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_5 =
+ _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]);
+ const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]);
+ const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]);
+ const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]);
+ const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+ const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+ const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+ const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+ const __m256i out_02_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m256i out_02_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m256i out_18_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m256i out_18_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m256i out_10_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m256i out_10_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m256i out_26_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m256i out_26_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m256i out_06_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m256i out_06_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m256i out_22_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m256i out_22_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m256i out_14_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m256i out_14_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m256i out_30_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m256i out_30_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m256i out_02_4 =
+ _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_5 =
+ _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_4 =
+ _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_5 =
+ _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_4 =
+ _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_5 =
+ _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_4 =
+ _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_5 =
+ _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_4 =
+ _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_5 =
+ _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_4 =
+ _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_5 =
+ _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_4 =
+ _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_5 =
+ _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_4 =
+ _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_5 =
+ _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm256_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm256_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+ }
+ // Final stage: the output indices are bit-reversed.
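+ // For example, the first block below produces outputs 1, 17, 9, 25, 7,
+ // 23, 15 and 31.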
+ {
+ const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+ const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+ const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+ const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+ const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+ const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+ const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+ const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+ const __m256i out_01_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m256i out_01_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m256i out_17_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m256i out_17_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m256i out_09_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m256i out_09_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m256i out_25_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m256i out_25_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m256i out_07_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m256i out_07_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m256i out_23_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m256i out_23_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m256i out_15_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m256i out_15_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m256i out_31_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m256i out_31_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m256i out_01_4 =
+ _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_5 =
+ _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_4 =
+ _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_5 =
+ _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_4 =
+ _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_5 =
+ _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_4 =
+ _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_5 =
+ _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_4 =
+ _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_5 =
+ _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_4 =
+ _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_5 =
+ _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_4 =
+ _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_5 =
+ _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_4 =
+ _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_5 =
+ _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm256_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm256_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm256_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+ const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+ const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+ const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+ const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+ const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+ const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+ const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+ const __m256i out_05_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m256i out_05_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m256i out_21_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m256i out_21_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m256i out_13_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m256i out_13_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m256i out_29_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m256i out_29_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m256i out_03_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m256i out_03_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m256i out_19_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m256i out_19_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m256i out_11_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m256i out_11_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m256i out_27_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m256i out_27_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m256i out_05_4 =
+ _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_5 =
+ _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_4 =
+ _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_5 =
+ _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_4 =
+ _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_5 =
+ _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_4 =
+ _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_5 =
+ _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_4 =
+ _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_5 =
+ _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_4 =
+ _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_5 =
+ _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_4 =
+ _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_5 =
+ _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_4 =
+ _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_5 =
+ _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm256_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm256_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m256i lstep1[64], lstep2[64], lstep3[64];
+ __m256i u[32], v[32], sign[16];
+ const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
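+      // Each 16-bit step register is widened into a pair of 32-bit
+      // registers (lstep[2 * i], lstep[2 * i + 1]) so the remaining stages
+      // run at 32-bit precision; K32One is the +1 bias used when the final
+      // outputs are rounded down by two bits.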
+ // start using 32-bit operations
+ // stage 3
+ {
+        // expanding to 32-bit length prior to the addition operations
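+        // (interleaving with kZero and then _mm256_madd_epi16 against kOne
+        //  sign-extends each 16-bit lane to 32 bits: v * 1 + 0 * 1 == v,
+        //  with the product taken as signed.)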
+ lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero);
+ lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero);
+ lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero);
+ lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero);
+ lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero);
+ lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero);
+ lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero);
+ lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero);
+ lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero);
+ lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero);
+ lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero);
+ lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero);
+ lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero);
+ lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero);
+ lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero);
+ lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero);
+ lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne);
+ lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne);
+ lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne);
+ lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne);
+ lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne);
+ lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne);
+ lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne);
+ lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne);
+ lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne);
+ lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne);
+ lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+
+ lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]);
+ lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]);
+ lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]);
+ lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]);
+ lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]);
+ lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]);
+ lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]);
+ lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]);
+ lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]);
+ lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]);
+ lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]);
+ lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]);
+ lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]);
+ lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]);
+ lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]);
+ lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+
+ lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+
+ lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+
+ lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+ }
+
+ // stage 4
+ {
+        // expanding to 32-bit length prior to the addition operations
+ lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero);
+ lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero);
+ lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero);
+ lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero);
+ lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+
+ lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]);
+ lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+        // Butterfly for lstep1[10..13]: multiply the interleaved
+        // lstep3[10..13] pairs by (cospi_16_64, +/-cospi_16_64).
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+        // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+        // instruction latency.
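+        // (k_madd_epi32_avx2 forms, per interleaved 32-bit pair, the dot
+        //  product with the paired cospi constants using 64-bit
+        //  intermediates; k_packs_epi64_avx2 repacks those results as
+        //  32-bit values for the rounding shift below.)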
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]);
+ lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+        // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+        // instruction latency.
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
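+        // Round this output group: subtracting the all-ones comparison mask
+        // adds 1 to negative lanes, so with the +K32One bias and the shift
+        // this computes (x + 1 + (x < 0)) >> 2.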
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ // Combine
+ out[0] = _mm256_packs_epi32(u[0], u[1]);
+ out[16] = _mm256_packs_epi32(u[2], u[3]);
+ out[8] = _mm256_packs_epi32(u[4], u[5]);
+ out[24] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+ v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
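+        // Same (x + 1 + (x < 0)) >> 2 output rounding as in stage 5 above.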
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ out[4] = _mm256_packs_epi32(u[0], u[1]);
+ out[20] = _mm256_packs_epi32(u[2], u[3]);
+ out[12] = _mm256_packs_epi32(u[4], u[5]);
+ out[28] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m256i k32_m28_m04 =
+ pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m12_m20 =
+ pair256_set_epi32(-cospi_12_64, -cospi_20_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20);
+ v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m256i k32_p30_p02 =
+ pair256_set_epi32(cospi_30_64, cospi_2_64);
+ const __m256i k32_p14_p18 =
+ pair256_set_epi32(cospi_14_64, cospi_18_64);
+ const __m256i k32_p22_p10 =
+ pair256_set_epi32(cospi_22_64, cospi_10_64);
+ const __m256i k32_p06_p26 =
+ pair256_set_epi32(cospi_6_64, cospi_26_64);
+ const __m256i k32_m26_p06 =
+ pair256_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m256i k32_m10_p22 =
+ pair256_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m256i k32_m18_p14 =
+ pair256_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m256i k32_m02_p30 =
+ pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
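+        // Output rounding, (x + 1 + (x < 0)) >> 2, with v[] holding the
+        // per-lane sign masks.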
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[2] = _mm256_packs_epi32(u[0], u[1]);
+ out[18] = _mm256_packs_epi32(u[2], u[3]);
+ out[10] = _mm256_packs_epi32(u[4], u[5]);
+ out[26] = _mm256_packs_epi32(u[6], u[7]);
+ out[6] = _mm256_packs_epi32(u[8], u[9]);
+ out[22] = _mm256_packs_epi32(u[10], u[11]);
+ out[14] = _mm256_packs_epi32(u[12], u[13]);
+ out[30] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
+ const __m256i k32_p31_p01 =
+ pair256_set_epi32(cospi_31_64, cospi_1_64);
+ const __m256i k32_p15_p17 =
+ pair256_set_epi32(cospi_15_64, cospi_17_64);
+ const __m256i k32_p23_p09 =
+ pair256_set_epi32(cospi_23_64, cospi_9_64);
+ const __m256i k32_p07_p25 =
+ pair256_set_epi32(cospi_7_64, cospi_25_64);
+ const __m256i k32_m25_p07 =
+ pair256_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m256i k32_m09_p23 =
+ pair256_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m256i k32_m17_p15 =
+ pair256_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m256i k32_m01_p31 =
+ pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[1] = _mm256_packs_epi32(u[0], u[1]);
+ out[17] = _mm256_packs_epi32(u[2], u[3]);
+ out[9] = _mm256_packs_epi32(u[4], u[5]);
+ out[25] = _mm256_packs_epi32(u[6], u[7]);
+ out[7] = _mm256_packs_epi32(u[8], u[9]);
+ out[23] = _mm256_packs_epi32(u[10], u[11]);
+ out[15] = _mm256_packs_epi32(u[12], u[13]);
+ out[31] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ const __m256i k32_p27_p05 =
+ pair256_set_epi32(cospi_27_64, cospi_5_64);
+ const __m256i k32_p11_p21 =
+ pair256_set_epi32(cospi_11_64, cospi_21_64);
+ const __m256i k32_p19_p13 =
+ pair256_set_epi32(cospi_19_64, cospi_13_64);
+ const __m256i k32_p03_p29 =
+ pair256_set_epi32(cospi_3_64, cospi_29_64);
+ const __m256i k32_m29_p03 =
+ pair256_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m256i k32_m13_p19 =
+ pair256_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m256i k32_m21_p11 =
+ pair256_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m256i k32_m05_p27 =
+ pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[5] = _mm256_packs_epi32(u[0], u[1]);
+ out[21] = _mm256_packs_epi32(u[2], u[3]);
+ out[13] = _mm256_packs_epi32(u[4], u[5]);
+ out[29] = _mm256_packs_epi32(u[6], u[7]);
+ out[3] = _mm256_packs_epi32(u[8], u[9]);
+ out[19] = _mm256_packs_epi32(u[10], u[11]);
+ out[11] = _mm256_packs_epi32(u[12], u[13]);
+ out[27] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ }
+#endif
+ // Transpose the results; do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output_currStep, *output_nextStep;
+ tran_low_t *curr_out, *next_out;
+ // Pass 0
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+ // Pass 1
+ curr_out = &output_org[column_start * 32];
+ next_out = &output_org[(column_start + 8) * 32];
+
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m256i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+ // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
+ // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
+ // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
+ // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+ // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+ // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
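+ // (Each label above and below is 20 * row + lane; it tracks where every
+ // 16-bit element moves through the unpack steps of the transpose.)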
+ const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31
+ // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71
+ // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35
+ // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75
+ // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 111
+ // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+ // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115
+ // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+ const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+ const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+ const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+ const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+ const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+ const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+ const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+ const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69
+ // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73
+ // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71
+ // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75
+ // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+ // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+ // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+ // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+ __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+ // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+ // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+ // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+ // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+ // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+ // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+ // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
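+ // The compares below return an all-ones mask (-1) in lanes holding a
+ // positive value, so subtracting that mask adds 1 in exactly those lanes
+ // (the "+ (output[j] > 0)" term) before the +1 and the >> 2.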
+ __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+ __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+ __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+ __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+ __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+ __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+ __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+ __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in av1/encoder/av1_dct.c
+ tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+ }
+ if (0 == pass) {
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
+ _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
+ _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
+ _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
+ _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
+ _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
+ _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
+ _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
+ _mm256_castsi256_si128(tr2_7));
+
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
+ _mm256_extractf128_si256(tr2_0, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
+ _mm256_extractf128_si256(tr2_1, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
+ _mm256_extractf128_si256(tr2_2, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
+ _mm256_extractf128_si256(tr2_3, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
+ _mm256_extractf128_si256(tr2_4, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
+ _mm256_extractf128_si256(tr2_5, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
+ _mm256_extractf128_si256(tr2_6, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
+ _mm256_extractf128_si256(tr2_7, 1));
+ // Process next 8x8
+ output_currStep += 8;
+ output_nextStep += 8;
+ }
+ if (1 == pass) {
+ store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32);
+ store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32);
+ store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32);
+ store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32);
+ store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32);
+ store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32);
+ store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32);
+ store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32);
+ curr_out += 8;
+ next_out += 8;
+ }
+ }
+ }
+ }
+ }
+ _mm256_zeroupper();
+} // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
new file mode 100644
index 000000000..69dd6af11
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -0,0 +1,3201 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "aom_dsp/fwd_txfm.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+// TODO(jingning) The high bit-depth version needs re-work for performance.
+// The current SSE2 implementation also cross-references the static functions
+// in the C implementation file.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+#if FDCT32x32_HIGH_PRECISION
+void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ aom_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c
+#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c
+#else
+void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ aom_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+ }
+}
+#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c
+#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c
+#endif // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif // DCT_HIGH_BIT_DEPTH
+
+void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+ // Constants
+ // When we use them, in one case they are all the same; in all other cases
+ // it is a pair of them that we need to repeat four times. This is done by
+ // constructing the 32-bit constant corresponding to that pair.
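+ // Roughly, for a butterfly pair (a, b) the SIMD code below computes the
+ // scalar equivalent of
+ //   t = a * c0 + b * c1;
+ //   r = (t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ // with (c0, c1) drawn from one of the k__cospi_* pairs; t and r here are
+ // just illustrative names.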
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ int pass;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 8) {
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
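+ // Each of the four blocks below covers eight rows: it loads in[i * stride]
+ // and in[(31 - i) * stride], forms their sum and difference, and applies
+ // the << 2 scaling shown in the formula above.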
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same approach as above to have common offset is
+ // counter-productive as all offsets can be calculated at compile
+ // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
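+ // The second pass forms the same sums and differences on the intermediate
+ // buffer, step1[i] = in[i * 32] + in[(31 - i) * 32] and the mirrored
+ // difference, with no extra << 2 scaling this time.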
+ {
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ step1[0] = ADD_EPI16(in00, in31);
+ step1[1] = ADD_EPI16(in01, in30);
+ step1[2] = ADD_EPI16(in02, in29);
+ step1[3] = ADD_EPI16(in03, in28);
+ step1[28] = SUB_EPI16(in03, in28);
+ step1[29] = SUB_EPI16(in02, in29);
+ step1[30] = SUB_EPI16(in01, in30);
+ step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+ &step1[3], &step1[28], &step1[29],
+ &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ step1[4] = ADD_EPI16(in04, in27);
+ step1[5] = ADD_EPI16(in05, in26);
+ step1[6] = ADD_EPI16(in06, in25);
+ step1[7] = ADD_EPI16(in07, in24);
+ step1[24] = SUB_EPI16(in07, in24);
+ step1[25] = SUB_EPI16(in06, in25);
+ step1[26] = SUB_EPI16(in05, in26);
+ step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+ &step1[7], &step1[24], &step1[25],
+ &step1[26], &step1[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ step1[8] = ADD_EPI16(in08, in23);
+ step1[9] = ADD_EPI16(in09, in22);
+ step1[10] = ADD_EPI16(in10, in21);
+ step1[11] = ADD_EPI16(in11, in20);
+ step1[20] = SUB_EPI16(in11, in20);
+ step1[21] = SUB_EPI16(in10, in21);
+ step1[22] = SUB_EPI16(in09, in22);
+ step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[20], &step1[21],
+ &step1[22], &step1[23]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ step1[12] = ADD_EPI16(in12, in19);
+ step1[13] = ADD_EPI16(in13, in18);
+ step1[14] = ADD_EPI16(in14, in17);
+ step1[15] = ADD_EPI16(in15, in16);
+ step1[16] = SUB_EPI16(in15, in16);
+ step1[17] = SUB_EPI16(in14, in17);
+ step1[18] = SUB_EPI16(in13, in18);
+ step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+ &step1[15], &step1[16], &step1[17],
+ &step1[18], &step1[19]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Stage 2
+ {
+ step2[0] = ADD_EPI16(step1[0], step1[15]);
+ step2[1] = ADD_EPI16(step1[1], step1[14]);
+ step2[2] = ADD_EPI16(step1[2], step1[13]);
+ step2[3] = ADD_EPI16(step1[3], step1[12]);
+ step2[4] = ADD_EPI16(step1[4], step1[11]);
+ step2[5] = ADD_EPI16(step1[5], step1[10]);
+ step2[6] = ADD_EPI16(step1[6], step1[9]);
+ step2[7] = ADD_EPI16(step1[7], step1[8]);
+ step2[8] = SUB_EPI16(step1[7], step1[8]);
+ step2[9] = SUB_EPI16(step1[6], step1[9]);
+ step2[10] = SUB_EPI16(step1[5], step1[10]);
+ step2[11] = SUB_EPI16(step1[4], step1[11]);
+ step2[12] = SUB_EPI16(step1[3], step1[12]);
+ step2[13] = SUB_EPI16(step1[2], step1[13]);
+ step2[14] = SUB_EPI16(step1[1], step1[14]);
+ step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
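+ // i.e. add DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) and shift right
+ // by DCT_CONST_BITS, as in the scalar dct_const_round_shift() helper.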
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+ &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+ // Reduce the magnitude here so that the intermediate values stay within
+ // the range of 16 bits.
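+ // The _mm_cmplt_epi16 masks are all-ones (-1) for negative lanes, so
+ // subtracting them and then adding kOne rounds each value as
+ // (x + 1 + (x < 0)) before the >> 2 that follows.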
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+ step2[0] = SUB_EPI16(step2[0], s3_00_0);
+ step2[1] = SUB_EPI16(step2[1], s3_01_0);
+ step2[2] = SUB_EPI16(step2[2], s3_02_0);
+ step2[3] = SUB_EPI16(step2[3], s3_03_0);
+ step2[4] = SUB_EPI16(step2[4], s3_04_0);
+ step2[5] = SUB_EPI16(step2[5], s3_05_0);
+ step2[6] = SUB_EPI16(step2[6], s3_06_0);
+ step2[7] = SUB_EPI16(step2[7], s3_07_0);
+ step2[8] = SUB_EPI16(step2[8], s2_08_0);
+ step2[9] = SUB_EPI16(step2[9], s2_09_0);
+ step2[10] = SUB_EPI16(step2[10], s3_10_0);
+ step2[11] = SUB_EPI16(step2[11], s3_11_0);
+ step2[12] = SUB_EPI16(step2[12], s3_12_0);
+ step2[13] = SUB_EPI16(step2[13], s3_13_0);
+ step2[14] = SUB_EPI16(step2[14], s2_14_0);
+ step2[15] = SUB_EPI16(step2[15], s2_15_0);
+ step1[16] = SUB_EPI16(step1[16], s3_16_0);
+ step1[17] = SUB_EPI16(step1[17], s3_17_0);
+ step1[18] = SUB_EPI16(step1[18], s3_18_0);
+ step1[19] = SUB_EPI16(step1[19], s3_19_0);
+ step2[20] = SUB_EPI16(step2[20], s3_20_0);
+ step2[21] = SUB_EPI16(step2[21], s3_21_0);
+ step2[22] = SUB_EPI16(step2[22], s3_22_0);
+ step2[23] = SUB_EPI16(step2[23], s3_23_0);
+ step2[24] = SUB_EPI16(step2[24], s3_24_0);
+ step2[25] = SUB_EPI16(step2[25], s3_25_0);
+ step2[26] = SUB_EPI16(step2[26], s3_26_0);
+ step2[27] = SUB_EPI16(step2[27], s3_27_0);
+ step1[28] = SUB_EPI16(step1[28], s3_28_0);
+ step1[29] = SUB_EPI16(step1[29], s3_29_0);
+ step1[30] = SUB_EPI16(step1[30], s3_30_0);
+ step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x32(
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
+ &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
+ &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
+ &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ step2[0] = _mm_add_epi16(step2[0], kOne);
+ step2[1] = _mm_add_epi16(step2[1], kOne);
+ step2[2] = _mm_add_epi16(step2[2], kOne);
+ step2[3] = _mm_add_epi16(step2[3], kOne);
+ step2[4] = _mm_add_epi16(step2[4], kOne);
+ step2[5] = _mm_add_epi16(step2[5], kOne);
+ step2[6] = _mm_add_epi16(step2[6], kOne);
+ step2[7] = _mm_add_epi16(step2[7], kOne);
+ step2[8] = _mm_add_epi16(step2[8], kOne);
+ step2[9] = _mm_add_epi16(step2[9], kOne);
+ step2[10] = _mm_add_epi16(step2[10], kOne);
+ step2[11] = _mm_add_epi16(step2[11], kOne);
+ step2[12] = _mm_add_epi16(step2[12], kOne);
+ step2[13] = _mm_add_epi16(step2[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step1[16] = _mm_add_epi16(step1[16], kOne);
+ step1[17] = _mm_add_epi16(step1[17], kOne);
+ step1[18] = _mm_add_epi16(step1[18], kOne);
+ step1[19] = _mm_add_epi16(step1[19], kOne);
+ step2[20] = _mm_add_epi16(step2[20], kOne);
+ step2[21] = _mm_add_epi16(step2[21], kOne);
+ step2[22] = _mm_add_epi16(step2[22], kOne);
+ step2[23] = _mm_add_epi16(step2[23], kOne);
+ step2[24] = _mm_add_epi16(step2[24], kOne);
+ step2[25] = _mm_add_epi16(step2[25], kOne);
+ step2[26] = _mm_add_epi16(step2[26], kOne);
+ step2[27] = _mm_add_epi16(step2[27], kOne);
+ step1[28] = _mm_add_epi16(step1[28], kOne);
+ step1[29] = _mm_add_epi16(step1[29], kOne);
+ step1[30] = _mm_add_epi16(step1[30], kOne);
+ step1[31] = _mm_add_epi16(step1[31], kOne);
+
+ step2[0] = _mm_srai_epi16(step2[0], 2);
+ step2[1] = _mm_srai_epi16(step2[1], 2);
+ step2[2] = _mm_srai_epi16(step2[2], 2);
+ step2[3] = _mm_srai_epi16(step2[3], 2);
+ step2[4] = _mm_srai_epi16(step2[4], 2);
+ step2[5] = _mm_srai_epi16(step2[5], 2);
+ step2[6] = _mm_srai_epi16(step2[6], 2);
+ step2[7] = _mm_srai_epi16(step2[7], 2);
+ step2[8] = _mm_srai_epi16(step2[8], 2);
+ step2[9] = _mm_srai_epi16(step2[9], 2);
+ step2[10] = _mm_srai_epi16(step2[10], 2);
+ step2[11] = _mm_srai_epi16(step2[11], 2);
+ step2[12] = _mm_srai_epi16(step2[12], 2);
+ step2[13] = _mm_srai_epi16(step2[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step1[16] = _mm_srai_epi16(step1[16], 2);
+ step1[17] = _mm_srai_epi16(step1[17], 2);
+ step1[18] = _mm_srai_epi16(step1[18], 2);
+ step1[19] = _mm_srai_epi16(step1[19], 2);
+ step2[20] = _mm_srai_epi16(step2[20], 2);
+ step2[21] = _mm_srai_epi16(step2[21], 2);
+ step2[22] = _mm_srai_epi16(step2[22], 2);
+ step2[23] = _mm_srai_epi16(step2[23], 2);
+ step2[24] = _mm_srai_epi16(step2[24], 2);
+ step2[25] = _mm_srai_epi16(step2[25], 2);
+ step2[26] = _mm_srai_epi16(step2[26], 2);
+ step2[27] = _mm_srai_epi16(step2[27], 2);
+ step1[28] = _mm_srai_epi16(step1[28], 2);
+ step1[29] = _mm_srai_epi16(step1[29], 2);
+ step1[30] = _mm_srai_epi16(step1[30], 2);
+ step1[31] = _mm_srai_epi16(step1[31], 2);
+ }
+#endif // !FDCT32x32_HIGH_PRECISION
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
+ // Stage 3
+ {
+ step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+ step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+ step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+ step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+ step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+ step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+ step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+ step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+ &step3[3], &step3[4], &step3[5],
+ &step3[6], &step3[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
+ &step3[13]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step3[16] = ADD_EPI16(step2[23], step1[16]);
+ step3[17] = ADD_EPI16(step2[22], step1[17]);
+ step3[18] = ADD_EPI16(step2[21], step1[18]);
+ step3[19] = ADD_EPI16(step2[20], step1[19]);
+ step3[20] = SUB_EPI16(step1[19], step2[20]);
+ step3[21] = SUB_EPI16(step1[18], step2[21]);
+ step3[22] = SUB_EPI16(step1[17], step2[22]);
+ step3[23] = SUB_EPI16(step1[16], step2[23]);
+ step3[24] = SUB_EPI16(step1[31], step2[24]);
+ step3[25] = SUB_EPI16(step1[30], step2[25]);
+ step3[26] = SUB_EPI16(step1[29], step2[26]);
+ step3[27] = SUB_EPI16(step1[28], step2[27]);
+ step3[28] = ADD_EPI16(step2[27], step1[28]);
+ step3[29] = ADD_EPI16(step2[26], step1[29]);
+ step3[30] = ADD_EPI16(step2[25], step1[30]);
+ step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
+ &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
+ &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
+ &step3[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+
+ // Stage 4
+ {
+ step1[0] = ADD_EPI16(step3[3], step3[0]);
+ step1[1] = ADD_EPI16(step3[2], step3[1]);
+ step1[2] = SUB_EPI16(step3[1], step3[2]);
+ step1[3] = SUB_EPI16(step3[0], step3[3]);
+ step1[8] = ADD_EPI16(step3[11], step2[8]);
+ step1[9] = ADD_EPI16(step3[10], step2[9]);
+ step1[10] = SUB_EPI16(step2[9], step3[10]);
+ step1[11] = SUB_EPI16(step2[8], step3[11]);
+ step1[12] = SUB_EPI16(step2[15], step3[12]);
+ step1[13] = SUB_EPI16(step2[14], step3[13]);
+ step1[14] = ADD_EPI16(step3[13], step2[14]);
+ step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
+ &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
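+ // unpacklo/unpackhi interleave step3[6] with step3[5] so that
+ // _mm_madd_epi16 against an interleaved (c0, c1) cosine constant computes
+ // c0 * step3[6] + c1 * step3[5] per 32-bit lane; each madd is one half of
+ // a butterfly rotation.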
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[26], &step1[27],
+ &step1[28], &step1[29]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 5
+ {
+ step2[4] = ADD_EPI16(step1[5], step3[4]);
+ step2[5] = SUB_EPI16(step3[4], step1[5]);
+ step2[6] = SUB_EPI16(step3[7], step1[6]);
+ step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
+ &step2[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 =
+ _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 =
+ _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 =
+ _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 =
+ _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 =
+ _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 =
+ _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 =
+ _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 =
+ _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
+ &step2[14]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step2[16] = ADD_EPI16(step1[19], step3[16]);
+ step2[17] = ADD_EPI16(step1[18], step3[17]);
+ step2[18] = SUB_EPI16(step3[17], step1[18]);
+ step2[19] = SUB_EPI16(step3[16], step1[19]);
+ step2[20] = SUB_EPI16(step3[23], step1[20]);
+ step2[21] = SUB_EPI16(step3[22], step1[21]);
+ step2[22] = ADD_EPI16(step1[21], step3[22]);
+ step2[23] = ADD_EPI16(step1[20], step3[23]);
+ step2[24] = ADD_EPI16(step1[27], step3[24]);
+ step2[25] = ADD_EPI16(step1[26], step3[25]);
+ step2[26] = SUB_EPI16(step3[25], step1[26]);
+ step2[27] = SUB_EPI16(step3[24], step1[27]);
+ step2[28] = SUB_EPI16(step3[31], step1[28]);
+ step2[29] = SUB_EPI16(step3[30], step1[29]);
+ step2[30] = ADD_EPI16(step1[29], step3[30]);
+ step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
+ &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
+ &step2[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 =
+ _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 =
+ _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 =
+ _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 =
+ _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 =
+ _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 =
+ _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 =
+ _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 =
+ _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step3[8] = ADD_EPI16(step2[9], step1[8]);
+ step3[9] = SUB_EPI16(step1[8], step2[9]);
+ step3[10] = SUB_EPI16(step1[11], step2[10]);
+ step3[11] = ADD_EPI16(step2[10], step1[11]);
+ step3[12] = ADD_EPI16(step2[13], step1[12]);
+ step3[13] = SUB_EPI16(step1[12], step2[13]);
+ step3[14] = SUB_EPI16(step1[15], step2[14]);
+ step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+ &step3[11], &step3[12], &step3[13],
+ &step3[14], &step3[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+ &step3[22], &step3[25], &step3[26],
+ &step3[29], &step3[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 =
+ _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 =
+ _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 =
+ _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 =
+ _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 =
+ _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 =
+ _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 =
+ _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 =
+ _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 =
+ _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 =
+ _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 =
+ _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 =
+ _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 =
+ _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 =
+ _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 =
+ _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 =
+ _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step1[16] = ADD_EPI16(step3[17], step2[16]);
+ step1[17] = SUB_EPI16(step2[16], step3[17]);
+ step1[18] = SUB_EPI16(step2[19], step3[18]);
+ step1[19] = ADD_EPI16(step3[18], step2[19]);
+ step1[20] = ADD_EPI16(step3[21], step2[20]);
+ step1[21] = SUB_EPI16(step2[20], step3[21]);
+ step1[22] = SUB_EPI16(step2[23], step3[22]);
+ step1[23] = ADD_EPI16(step3[22], step2[23]);
+ step1[24] = ADD_EPI16(step3[25], step2[24]);
+ step1[25] = SUB_EPI16(step2[24], step3[25]);
+ step1[26] = SUB_EPI16(step2[27], step3[26]);
+ step1[27] = ADD_EPI16(step3[26], step2[27]);
+ step1[28] = ADD_EPI16(step3[29], step2[28]);
+ step1[29] = SUB_EPI16(step2[28], step3[29]);
+ step1[30] = SUB_EPI16(step2[31], step3[30]);
+ step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
+ &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
+ &step1[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Final stage: output indices are bit-reversed.
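+ // (The two blocks below map step1[16..31] onto the odd-indexed outputs
+ // out[1], out[3], ..., out[31].)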
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 =
+ _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 =
+ _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 =
+ _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 =
+ _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 =
+ _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 =
+ _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 =
+ _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 =
+ _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 =
+ _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 =
+ _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 =
+ _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 =
+ _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 =
+ _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 =
+ _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 =
+ _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 =
+ _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 =
+ _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 =
+ _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 =
+ _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 =
+ _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 =
+ _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 =
+ _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 =
+ _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 =
+ _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 =
+ _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 =
+ _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 =
+ _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 =
+ _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 =
+ _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 =
+ _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 =
+ _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 =
+ _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m128i lstep1[64], lstep2[64], lstep3[64];
+ __m128i u[32], v[32], sign[16];
+ const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
+ // start using 32-bit operations
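+ // (All butterflies in this branch run on 32-bit lanes (lstep*), so int16
+ // saturation cannot occur; the k_check_epi32_overflow_* guards below fall
+ // back to the C rows transform if a 32-bit multiply overflows.)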
+ // stage 3
+ {
+ // expanding to 32-bit length prior to addition operations
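+ // (Unpacking against kZero zero-extends each int16 lane; the following
+ // _mm_madd_epi16 with kOne computes v * 1 + 0 * 1, which yields the value
+ // properly sign-extended to 32 bits.)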
+ lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
+ lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
+ lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
+ lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
+ lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
+ lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
+ lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
+ lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
+ lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
+ lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
+ lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
+ lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
+ lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
+ lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
+ lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
+ lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
+ lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
+ lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
+ lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
+ lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
+ lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
+ lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
+ lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
+ lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
+ lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
+ lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
+ lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
+
+ lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
+ lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
+ lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
+ lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
+ lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
+ lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
+ lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
+ lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
+ lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
+ lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
+ lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
+ lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
+ lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
+ lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
+ lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
+ lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
+
+ lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
+
+ lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
+ lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+ }
+
+ // stage 4
+ {
+ // expanding to 32-bit length prior to addition operations
+ lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
+ lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
+ lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
+ lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
+ lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
+
+ lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
+ lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+ // stage 4, continued: the cospi_16 rotation on lstep3[10..13], now done
+ // in 32-bit lanes.
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32 to further hide
+ // instruction latency.
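+ // k_madd_epi32 is the 32-bit counterpart of _mm_madd_epi16: it multiplies
+ // the interleaved (a, b) pairs by the cosine-constant pair and accumulates
+ // into 64-bit lanes; k_packs_epi64 below gathers those results back into
+ // 32-bit lanes ahead of the usual rounding shift.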
+ v[0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
+ &v[5], &v[6], &v[7], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32(u[9], k32_m24_m08);
+ v[10] = k_madd_epi32(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[9], k32_m08_p24);
+ v[22] = k_madd_epi32(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
+ lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+ // TODO(jingning): manually inline k_madd_epi32 to further hide
+ // instruction latency.
+ v[0] = k_madd_epi32(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32(u[7], k32_m08_p24);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
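+          // Descale by 4 with rounding, (x + 1 + (x < 0)) >> 2: subtracting
+          // the sign mask adds the extra 1 for negative values before the
+          // final shift and the pack back to 16 bits.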
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ // Combine
+ out[0] = _mm_packs_epi32(u[0], u[1]);
+ out[16] = _mm_packs_epi32(u[2], u[3]);
+ out[8] = _mm_packs_epi32(u[4], u[5]);
+ out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32(u[7], k32_m24_m08);
+ v[8] = k_madd_epi32(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
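+          // Rotations and final rounding producing output rows 4, 12, 20
+          // and 28.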
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
+ u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
+
+ v[0] = k_madd_epi32(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32(u[7], k32_p12_p20);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32(u[15], k32_m04_p28);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ out[4] = _mm_packs_epi32(u[0], u[1]);
+ out[20] = _mm_packs_epi32(u[2], u[3]);
+ out[12] = _mm_packs_epi32(u[4], u[5]);
+ out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m12_m20 =
+ pair_set_epi32(-cospi_12_64, -cospi_20_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[0] = k_madd_epi32(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32(u[9], k32_p12_p20);
+ v[22] = k_madd_epi32(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32(u[3], k32_p28_p04);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
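+          // Rotations and final rounding producing output rows 2, 6, 10, 14,
+          // 18, 22, 26 and 30.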
+ const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
+ const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
+ const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
+ const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
+ const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[0] = k_madd_epi32(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32(u[9], k32_p22_p10);
+ v[10] = k_madd_epi32(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32(u[9], k32_m10_p22);
+ v[22] = k_madd_epi32(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32(u[3], k32_m02_p30);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[2] = _mm_packs_epi32(u[0], u[1]);
+ out[18] = _mm_packs_epi32(u[2], u[3]);
+ out[10] = _mm_packs_epi32(u[4], u[5]);
+ out[26] = _mm_packs_epi32(u[6], u[7]);
+ out[6] = _mm_packs_epi32(u[8], u[9]);
+ out[22] = _mm_packs_epi32(u[10], u[11]);
+ out[14] = _mm_packs_epi32(u[12], u[13]);
+ out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
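+          // Rotations and final rounding producing output rows 1, 7, 9, 15,
+          // 17, 23, 25 and 31.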
+ const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
+ const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
+ const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
+ const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
+ const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[0] = k_madd_epi32(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32(u[9], k32_p23_p09);
+ v[10] = k_madd_epi32(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32(u[9], k32_m09_p23);
+ v[22] = k_madd_epi32(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32(u[3], k32_m01_p31);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[1] = _mm_packs_epi32(u[0], u[1]);
+ out[17] = _mm_packs_epi32(u[2], u[3]);
+ out[9] = _mm_packs_epi32(u[4], u[5]);
+ out[25] = _mm_packs_epi32(u[6], u[7]);
+ out[7] = _mm_packs_epi32(u[8], u[9]);
+ out[23] = _mm_packs_epi32(u[10], u[11]);
+ out[15] = _mm_packs_epi32(u[12], u[13]);
+ out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
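+          // Rotations and final rounding producing the remaining odd output
+          // rows 3, 5, 11, 13, 19, 21, 27 and 29.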
+ const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
+ const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
+ const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
+ const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
+ const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[0] = k_madd_epi32(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32(u[9], k32_p19_p13);
+ v[10] = k_madd_epi32(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32(u[9], k32_m13_p19);
+ v[22] = k_madd_epi32(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32(u[3], k32_m05_p27);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[5] = _mm_packs_epi32(u[0], u[1]);
+ out[21] = _mm_packs_epi32(u[2], u[3]);
+ out[13] = _mm_packs_epi32(u[4], u[5]);
+ out[29] = _mm_packs_epi32(u[6], u[7]);
+ out[3] = _mm_packs_epi32(u[8], u[9]);
+ out[19] = _mm_packs_epi32(u[10], u[11]);
+ out[11] = _mm_packs_epi32(u[12], u[13]);
+ out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+#endif // FDCT32x32_HIGH_PRECISION
+ // Transpose the results, do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output0 = &intermediate[column_start * 32];
+ tran_low_t *output1 = &output_org[column_start * 32];
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m128i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+          // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+          // 04 14 24 34 05 15 25 35
+          // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+          if (pass == 0) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+ __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+ __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+ __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+ __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+ __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+ __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+ __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in av1/encoder/av1_dct.c
+ tr2_0 = _mm_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm_srai_epi16(tr2_7, 2);
+ }
+          // Note: even though all these stores are aligned, using the aligned
+          // intrinsic makes the code slightly slower.
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+ // Process next 8x8
+ output0 += 8;
+ } else {
+ storeu_output(&tr2_0, (output1 + 0 * 32));
+ storeu_output(&tr2_1, (output1 + 1 * 32));
+ storeu_output(&tr2_2, (output1 + 2 * 32));
+ storeu_output(&tr2_3, (output1 + 3 * 32));
+ storeu_output(&tr2_4, (output1 + 4 * 32));
+ storeu_output(&tr2_5, (output1 + 5 * 32));
+ storeu_output(&tr2_6, (output1 + 6 * 32));
+ storeu_output(&tr2_7, (output1 + 7 * 32));
+ // Process next 8x8
+ output1 += 8;
+ }
+ }
+ }
+ }
+ }
+} // NOLINT
+
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
new file mode 100644
index 000000000..670f864d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+
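+// The shared 32x32 forward DCT implementation is included twice below: once
+// for the faster low-precision (rd) variant and once for the high-precision
+// variant, selected through the FDCT32x32_HIGH_PRECISION macro.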
+#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
new file mode 100644
index 000000000..d3aceae00
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
+#define AOM_DSP_X86_FWD_TXFM_AVX2_H
+
+#include "./aom_config.h"
+
+static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
+#if CONFIG_HIGHBITDEPTH
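+  // tran_low_t is 32 bits wide in this configuration, so sign-extend the
+  // sixteen 16-bit coefficients to 32 bits before storing.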
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+
+ __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+ __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+
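+  // The AVX2 unpacks interleave within each 128-bit lane, so a cross-lane
+  // permute is needed to restore the original coefficient order.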
+ __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+ __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+
+ _mm256_storeu_si256((__m256i *)out, y0);
+ _mm256_storeu_si256((__m256i *)(out + 8), y1);
+#else
+ _mm256_storeu_si256((__m256i *)out, *coeff);
+#endif
+}
+
+#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 000000000..7bb1db70a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,1014 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
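+// Use saturating adds/subs in the high bit-depth build so that intermediate
+// overflow clips instead of wrapping and is caught by the overflow checks,
+// which then fall back to the C implementation.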
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations. The outputs, o0 through oF, are labeled according to the
+  // output locations.
+
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
+ const __m128i k__cospi_A =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 =
+ _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+ __m128i cmp0, cmp1;
+ int test, overflow;
+#endif
+
+ // Load inputs.
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+#if DCT_HIGH_BIT_DEPTH
+  // Check that the inputs are small enough to use the optimised code path
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
+ test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+ if (test) {
+ aom_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // multiply by 16 to give some extra precision
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+    // The mask only records whether the first value is zero; all other
+    // comparisons fail because a value shifted left by 4 (above) can never
+    // equal one. To increment in the non-zero case, we add the mask and
+    // one for the first element:
+    // - if zero:     mask = -1, v = v - 1 + 1 = v
+    // - if non-zero: mask =  0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+  // There are 4 stages in total, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+ const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ aom_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ in0 = _mm_shuffle_epi32(x0, 0xD8);
+ in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+    // Vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+ // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+ const __m128i t0 = ADD_EPI16(in0, in1);
+ const __m128i t1 = SUB_EPI16(in0, in1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&t0, &t1);
+ if (overflow) {
+ aom_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+      // Then add and right-shift to get back to the 16-bit range, folding
+      // in the final right-shift as well to save operations. This unusual
+      // rounding maintains bit-accurate compatibility with the C version of
+      // this function, which performs two rounding steps in a row (see the
+      // equivalence sketch after this function).
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ // w0 = [o0 o4 o8 oC]
+ // w1 = [o2 o6 oA oE]
+ // w2 = [o1 o5 o9 oD]
+ // w3 = [o3 o7 oB oF]
+ // remember the o's are numbered according to the correct output location
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ aom_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+ // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+ const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+ const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+ // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+ // y1 = [o2 o3 o6 o7 oA oB oE oF]
+ in0 = _mm_unpacklo_epi32(y0, y1);
+ // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+ in1 = _mm_unpackhi_epi32(y0, y1);
+ // in1 = [o8 o9 oA oB oC oD oE oF]
+ }
+ }
+ }
+  // The post-condition (v + 1) >> 2 is now incorporated into the previous
+  // add and right-shift commands. Only 2 store instructions are needed
+  // because rows 1 and 3 are stored immediately after rows 0 and 2.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
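The combined rounding used in stage 4 above can be checked directly: the C path applies dct_const_round_shift (add 2^(DCT_CONST_BITS-1), shift right by DCT_CONST_BITS) followed by a final (t + 1) >> 2, while the SSE2 path adds 3 * 2^(DCT_CONST_BITS-1) once and shifts by DCT_CONST_BITS + 2. A minimal equivalence check (not part of the patch), assuming DCT_CONST_BITS == 14 as in aom_dsp/txfm_common.h and arithmetic right shifts on negative values, which the SIMD code relies on anyway:

/* Sketch only: verifies that the folded rounding is bit-exact with the
 * two-step C rounding for the tested range of 32-bit intermediates. */
#include <assert.h>
#include <stdint.h>

#define DCT_CONST_BITS 14 /* assumed value, as in aom_dsp/txfm_common.h */
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

static int32_t two_step(int32_t u) {
  const int32_t t = (u + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; /* round shift */
  return (t + 1) >> 2;                                          /* post-condition */
}

static int32_t folded(int32_t u) {
  const int32_t r2 = DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1);
  return (u + r2) >> (DCT_CONST_BITS + 2); /* k__DCT_CONST_ROUNDING2, DCT_CONST_BITS2 */
}

int main(void) {
  for (int32_t u = -(1 << 20); u <= (1 << 20); ++u)
    assert(two_step(u) == folded(u));
  return 0;
}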
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+  // In one case all eight coefficients are the same; in all other cases it
+  // is a pair of coefficients that is repeated four times, done by
+  // constructing the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants which gets us
+      // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+      // Interleave to do the multiply by constants which gets us
+      // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+        // Interleave to do the multiply by constants which gets us
+        // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+    // Post-condition (division by two)
+    // Division of a 16-bit signed number by two using shifts, rounding
+    // toward zero: n / 2 = (n - (n >> 15)) >> 1 (see the sketch after
+    // this function).
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
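The post-condition above halves each 16-bit result while rounding toward zero rather than toward negative infinity: n >> 15 is -1 for negative n and 0 otherwise, so subtracting it adds 1 before the halving exactly when n is negative. A minimal sketch (not part of the patch) verifying the identity over the whole int16_t range, assuming arithmetic right shift of negative values:

/* Sketch only: checks the round-toward-zero halving identity used in the
 * FDCT8x8_2D post-condition. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int32_t i = INT16_MIN; i <= INT16_MAX; ++i) {
    const int16_t n = (int16_t)i;
    const int16_t halved = (int16_t)((n - (n >> 15)) >> 1);
    assert(halved == n / 2); /* C division truncates toward zero */
  }
  return 0;
}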
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two very similar passes. The first pass
+  // transforms the columns and transposes the results. The second pass
+  // transforms the rows; since the first-pass results are already
+  // transposed, it again works on columns (that is, the transposed rows)
+  // and transposes its results so that the output ends up back in normal
+  // row order. (A scalar sketch of this structure follows the function.)
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+ const int16_t *in = input;
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ // Constants
+  // In one case all eight coefficients are the same; in all other cases it
+  // is a pair of coefficients that is repeated four times, done by
+  // constructing the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = ADD_EPI16(in00, in15);
+ input1 = ADD_EPI16(in01, in14);
+ input2 = ADD_EPI16(in02, in13);
+ input3 = ADD_EPI16(in03, in12);
+ input4 = ADD_EPI16(in04, in11);
+ input5 = ADD_EPI16(in05, in10);
+ input6 = ADD_EPI16(in06, in09);
+ input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+ &input4, &input5, &input6, &input7);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = SUB_EPI16(in07, in08);
+ step1_1 = SUB_EPI16(in06, in09);
+ step1_2 = SUB_EPI16(in05, in10);
+ step1_3 = SUB_EPI16(in04, in11);
+ step1_4 = SUB_EPI16(in03, in12);
+ step1_5 = SUB_EPI16(in02, in13);
+ step1_6 = SUB_EPI16(in01, in14);
+ step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(input0, input7);
+ const __m128i q1 = ADD_EPI16(input1, input6);
+ const __m128i q2 = ADD_EPI16(input2, input5);
+ const __m128i q3 = ADD_EPI16(input3, input4);
+ const __m128i q4 = SUB_EPI16(input3, input4);
+ const __m128i q5 = SUB_EPI16(input2, input5);
+ const __m128i q6 = SUB_EPI16(input1, input6);
+ const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i r0 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ const __m128i r1 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 3
+ {
+ step3_0 = ADD_EPI16(step1_0, step2_3);
+ step3_1 = ADD_EPI16(step1_1, step2_2);
+ step3_2 = SUB_EPI16(step1_1, step2_2);
+ step3_3 = SUB_EPI16(step1_0, step2_3);
+ step3_4 = SUB_EPI16(step1_7, step2_4);
+ step3_5 = SUB_EPI16(step1_6, step2_5);
+ step3_6 = ADD_EPI16(step1_6, step2_5);
+ step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
+ &step3_4, &step3_5, &step3_6, &step3_7);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 5
+ {
+ step1_0 = ADD_EPI16(step3_0, step2_1);
+ step1_1 = SUB_EPI16(step3_0, step2_1);
+ step1_2 = ADD_EPI16(step3_3, step2_2);
+ step1_3 = SUB_EPI16(step3_3, step2_2);
+ step1_4 = SUB_EPI16(step3_4, step2_5);
+ step1_5 = ADD_EPI16(step3_4, step2_5);
+ step1_6 = SUB_EPI16(step3_7, step2_6);
+ step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+ if (overflow) {
+ aom_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
+ &res06, &res07, pass, out0, out1);
+ transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
+ &res14, &res15, pass, out0 + 8, out1 + 8);
+ if (pass == 0) {
+ out0 += 8 * 16;
+ } else {
+ out1 += 8 * 16;
+ }
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ }
+}
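As described in the comment at the top of this function, the transform is separable: each pass runs the same 1D column transform and writes its results transposed. A minimal scalar sketch of that structure (not part of the patch), using a hypothetical identity stand-in fwd1d_col() for the 16-point 1D transform and omitting the pre-scaling and inter-pass rounding of the real code:

/* Sketch only: two-pass separable 2D transform with a transpose after each
 * pass, mirroring the column-start loop structure above. */
#include <stdint.h>

#define N 16

/* Hypothetical 1D column transform; an identity stand-in for the real DCT. */
static void fwd1d_col(const int32_t in[N], int32_t out[N]) {
  for (int i = 0; i < N; ++i) out[i] = in[i];
}

void fwd2d_sketch(const int16_t *input, int32_t *output, int stride) {
  int32_t intermediate[N * N], col[N], res[N];
  /* Pass 1: transform the columns of the input and store the results
   * transposed, so pass 2 can reuse the same column code. */
  for (int c = 0; c < N; ++c) {
    for (int r = 0; r < N; ++r) col[r] = input[r * stride + c];
    fwd1d_col(col, res);
    for (int r = 0; r < N; ++r) intermediate[c * N + r] = res[r];
  }
  /* Pass 2: transform the columns of the intermediate (i.e. the original
   * rows) and transpose again so the output lands back in row order. */
  for (int c = 0; c < N; ++c) {
    for (int r = 0; r < N; ++r) col[r] = intermediate[r * N + c];
    fwd1d_col(col, res);
    for (int r = 0; r < N; ++r) output[c * N + r] = res[r];
  }
}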
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 000000000..a337e618d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+
+void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0, in1;
+ __m128i tmp;
+ const __m128i zero = _mm_setzero_si128();
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+
+ tmp = _mm_add_epi16(in0, in1);
+ in0 = _mm_unpacklo_epi16(zero, tmp);
+ in1 = _mm_unpackhi_epi16(zero, tmp);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(tmp, zero);
+ in1 = _mm_unpackhi_epi32(tmp, zero);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(tmp, 8);
+
+ in1 = _mm_add_epi32(tmp, in0);
+ in0 = _mm_slli_epi32(in1, 1);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
+}
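The *_1 (DC-only) functions in this file repeatedly use the same sign-extension idiom: interleaving the 16-bit sums into the upper halves of 32-bit lanes (unpack with zero) and then arithmetic-shifting right by 16. A minimal scalar sketch of the equivalent operation (not part of the patch), assuming two's complement and arithmetic right shift of negative values:

/* Sketch only: shows that placing a 16-bit value in the upper half of a
 * 32-bit lane and shifting right by 16 sign-extends it. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int32_t i = INT16_MIN; i <= INT16_MAX; ++i) {
    const int16_t x = (int16_t)i;
    /* value placed in the upper 16 bits of a 32-bit lane ... */
    const int32_t lane = (int32_t)((uint32_t)(uint16_t)x << 16);
    /* ... then an arithmetic right shift by 16 sign-extends it */
    assert((lane >> 16) == (int32_t)x);
  }
  return 0;
}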
+
+void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ input += 8 * stride;
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 1);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+void aom_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 3);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D aom_fdct4x4_sse2
+#define FDCT8x8_2D aom_fdct8x8_sse2
+#define FDCT16x16_2D aom_fdct16x16_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D aom_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D aom_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+
+#if CONFIG_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT4x4_2D aom_highbd_fdct4x4_sse2
+#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
+#define FDCT16x16_2D aom_highbd_fdct16x16_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D aom_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 000000000..26b2db2e0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#define AOM_DSP_X86_FWD_TXFM_SSE2_H_
+
+#include "aom_dsp/x86/txfm_common_intrin.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
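These overflow helpers report whether any 16-bit lane has hit the saturation bounds 0x7fff / 0x8000, i.e. whether a preceding _mm_adds_epi16 / _mm_subs_epi16 may have clipped; a nonzero result makes the callers fall back to the C high bit-depth functions. A scalar sketch (not part of the patch) of what check_epi16_overflow_x2() detects; the helper name below is illustrative:

/* Sketch only: nonzero if any lane equals INT16_MAX or INT16_MIN. */
#include <stdint.h>

int any_lane_saturated(const int16_t *v, int nlanes) {
  int hit = 0;
  for (int i = 0; i < nlanes; ++i)
    hit |= (v[i] == INT16_MAX) || (v[i] == INT16_MIN);
  return hit; /* nonzero => fall back to the C high bit-depth path */
}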
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x12(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *zero) {
+ __m128i minus_one = _mm_set1_epi32(-1);
+ // Check for overflows
+ __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+ __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+ __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+ __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
+ __m128i reg0_top_dwords =
+ _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg1_top_dwords =
+ _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg2_top_dwords =
+ _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg3_top_dwords =
+ _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
+ __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
+  __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
+  __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
+  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
+  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
+  int overflow_01 =
+      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
+  int overflow_23 =
+      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
+ return (overflow_01 + overflow_23);
+}
+
+static INLINE int k_check_epi32_overflow_8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
+ }
+ }
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
+ preg27, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
+ preg31, zero);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return overflow;
+}
+
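+// Note: when CONFIG_HIGHBITDEPTH is enabled, tran_low_t is 32 bits wide, so
+// the eight 16-bit results are sign-extended to 32 bits before being stored.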
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_HIGHBITDEPTH
+}
+
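+// Multiplies the 16-bit pairs in *pin0/*pin1 against the coefficient pairs in
+// *pmultiplier (_mm_madd_epi16), adds the rounding constant, arithmetic-shifts
+// right by 'shift' and packs both results back to saturated 16-bit values.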
+static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
+ const __m128i *pmultiplier,
+ const __m128i *prounding, int shift) {
+ const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+ const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+ const __m128i v0 = _mm_add_epi32(u0, *prounding);
+ const __m128i v1 = _mm_add_epi32(u1, *prounding);
+ const __m128i w0 = _mm_srai_epi32(v0, shift);
+ const __m128i w1 = _mm_srai_epi32(v1, shift);
+ return _mm_packs_epi32(w0, w1);
+}
+
+static INLINE void transpose_and_output8x8(
+ const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
+ const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
+ const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr,
+ tran_low_t *out1_ptr) {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
+ } else {
+ storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+ storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+ storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+ storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+ storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+ storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+ storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+ storeu_output(&tr2_7, (out1_ptr + 7 * 16));
+ }
+}
+
+void fdct32_8col(__m128i *in0, __m128i *in1);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 000000000..8fa1c04d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides an SSSE3 version of the forward transformation. Some of
+; the macro definitions were originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192: times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
+pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 11585, 11585
+TRANSFORM_COEFFS 15137, 6270
+TRANSFORM_COEFFS 16069, 3196
+TRANSFORM_COEFFS 9102, 13623
+
+SECTION .text
+
+%if ARCH_X86_64
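+; in-place butterfly: %1 = %1 + %2, %2 = %1 - %2 (uses %3 as scratch)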
+%macro SUM_SUB 3
+ psubw m%3, m%1, m%2
+ paddw m%1, m%2
+ SWAP %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+ pmaddwd m%1, m%3, %5
+ pmaddwd m%2, m%3, %6
+ paddd m%1, %4
+ paddd m%2, %4
+ psrad m%1, 14
+ psrad m%2, 14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+ punpckhwd m%6, m%2, m%1
+ MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4]
+ punpcklwd m%2, m%1
+ MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4]
+ packssdw m%1, m%7
+ packssdw m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+ punpckh%1 m%4, m%2, m%3
+ punpckl%1 m%2, m%3
+ SWAP %3, %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+ INTERLEAVE_2X wd, %1, %2, %9
+ INTERLEAVE_2X wd, %3, %4, %9
+ INTERLEAVE_2X wd, %5, %6, %9
+ INTERLEAVE_2X wd, %7, %8, %9
+
+ INTERLEAVE_2X dq, %1, %3, %9
+ INTERLEAVE_2X dq, %2, %4, %9
+ INTERLEAVE_2X dq, %5, %7, %9
+ INTERLEAVE_2X dq, %6, %8, %9
+
+ INTERLEAVE_2X qdq, %1, %5, %9
+ INTERLEAVE_2X qdq, %3, %7, %9
+ INTERLEAVE_2X qdq, %2, %6, %9
+ INTERLEAVE_2X qdq, %4, %8, %9
+
+ SWAP %2, %5
+ SWAP %4, %7
+%endmacro
+
+; 1D forward 8x8 DCT transform
+%macro FDCT8_1D 1
+ SUM_SUB 0, 7, 9
+ SUM_SUB 1, 6, 9
+ SUM_SUB 2, 5, 9
+ SUM_SUB 3, 4, 9
+
+ SUM_SUB 0, 3, 9
+ SUM_SUB 1, 2, 9
+ SUM_SUB 6, 5, 9
+%if %1 == 0
+ SUM_SUB 0, 1, 9
+%endif
+
+ BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10
+
+ pmulhrsw m6, m12
+ pmulhrsw m5, m12
+%if %1 == 0
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+%else
+ BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10
+ SWAP 0, 1
+%endif
+
+ SUM_SUB 4, 5, 9
+ SUM_SUB 7, 6, 9
+ BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10
+ BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10
+ SWAP 1, 4
+ SWAP 3, 6
+%endmacro
+
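+; divide by two, rounding towards zero: add 1 to negative values (sign mask
+; from psraw 15) before the final arithmetic shift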
+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
+ psraw m%3, m%1, 15
+ psraw m%4, m%2, 15
+ psubw m%1, m%3
+ psubw m%2, m%4
+ psraw m%1, 1
+ psraw m%2, 1
+%endmacro
+
+%macro STORE_OUTPUT 2 ; index, result
+%if CONFIG_HIGHBITDEPTH
+ ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+ ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ pxor m11, m11
+ pcmpgtw m11, m%2
+ movdqa m12, m%2
+ punpcklwd m%2, m11
+ punpckhwd m12, m11
+ mova [outputq + 4*%1 + 0], m%2
+ mova [outputq + 4*%1 + 16], m12
+%else
+ mova [outputq + 2*%1], m%2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+ mova m8, [pd_8192]
+ mova m12, [pw_11585x2]
+
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ ; left shift by 2 to increase forward transformation precision
+ psllw m0, 2
+ psllw m1, 2
+ psllw m2, 2
+ psllw m3, 2
+ psllw m4, 2
+ psllw m5, 2
+ psllw m6, 2
+ psllw m7, 2
+
+ ; column transform
+ FDCT8_1D 0
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
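+  ; row transform (second pass, on the transposed data)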
+ FDCT8_1D 1
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+ DIVIDE_ROUND_2X 0, 1, 9, 10
+ DIVIDE_ROUND_2X 2, 3, 9, 10
+ DIVIDE_ROUND_2X 4, 5, 9, 10
+ DIVIDE_ROUND_2X 6, 7, 9, 10
+
+ STORE_OUTPUT 0, 0
+ STORE_OUTPUT 8, 1
+ STORE_OUTPUT 16, 2
+ STORE_OUTPUT 24, 3
+ STORE_OUTPUT 32, 4
+ STORE_OUTPUT 40, 5
+ STORE_OUTPUT 48, 6
+ STORE_OUTPUT 56, 7
+
+ RET
+%endif
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
new file mode 100644
index 000000000..60446b086
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
@@ -0,0 +1,349 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
+; int ref_stride,
+; unsigned char *src,
+; int src_stride,
+; unsigned int height,
+; int *sum,
+; unsigned int *sumsquared)
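+;
+; Averages each pixel of 'ref' with its right neighbour, then averages that
+; result with the corresponding value from the next row (half-pel horizontal
+; plus vertical interpolation), and accumulates the sum and the sum of squared
+; differences against 'src'.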
+global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE
+sym(aom_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+ mov rsi, arg(0) ;ref
+
+ mov rdi, arg(2) ;src
+ movsxd rcx, dword ptr arg(4) ;height
+ movsxd rax, dword ptr arg(1) ;ref_stride
+ movsxd rdx, dword ptr arg(3) ;src_stride
+
+ pxor xmm0, xmm0 ;
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ movdqu xmm3, XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5, xmm3) horizontal line 1
+
+ lea rsi, [rsi + rax]
+
+aom_half_horiz_vert_variance16x_h_1:
+ movdqu xmm1, XMMWORD PTR [rsi] ;
+ movdqu xmm2, XMMWORD PTR [rsi+1] ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1, xmm2) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
+
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm4, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+
+ movq xmm3, QWORD PTR [rdi+8]
+ punpcklbw xmm3, xmm0
+ psubw xmm4, xmm3
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz aom_half_horiz_vert_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void aom_half_vert_variance16x_h_sse2(unsigned char *ref,
+; int ref_stride,
+; unsigned char *src,
+; int src_stride,
+; unsigned int height,
+; int *sum,
+; unsigned int *sumsquared)
+global sym(aom_half_vert_variance16x_h_sse2) PRIVATE
+sym(aom_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+ mov rsi, arg(0) ;ref
+
+ mov rdi, arg(2) ;src
+ movsxd rcx, dword ptr arg(4) ;height
+ movsxd rax, dword ptr arg(1) ;ref_stride
+ movsxd rdx, dword ptr arg(3) ;src_stride
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ lea rsi, [rsi + rax ]
+ pxor xmm0, xmm0
+
+aom_half_vert_variance16x_h_1:
+ movdqu xmm3, XMMWORD PTR [rsi]
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5, xmm3)
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm4, xmm0
+
+ movq xmm2, QWORD PTR [rdi]
+ punpcklbw xmm2, xmm0
+ psubw xmm5, xmm2
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+ psubw xmm4, xmm2
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm3
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1
+ jnz aom_half_vert_variance16x_h_1
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref,
+;                                       int ref_stride,
+; unsigned char *src,
+; int src_stride,
+; unsigned int height,
+; int *sum,
+; unsigned int *sumsquared)
+global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE
+sym(aom_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+ mov rsi, arg(0) ;ref
+
+ mov rdi, arg(2) ;src
+ movsxd rcx, dword ptr arg(4) ;height
+ movsxd rax, dword ptr arg(1) ;ref_stride
+ movsxd rdx, dword ptr arg(3) ;src_stride
+
+ pxor xmm0, xmm0 ;
+
+aom_half_horiz_variance16x_h_1:
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5, xmm3)
+ movdqa xmm1, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm1, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ psubw xmm1, xmm2
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm1
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm1, xmm1
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz aom_half_horiz_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+aom_bilinear_filters_sse2:
+ dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
+ dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+ dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+ dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+ dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+ dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+ dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
new file mode 100644
index 000000000..a99c0b40e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
+ int ref_stride,
+ const unsigned char *src,
+ int src_stride, unsigned int height,
+ int *sum, unsigned int *sumsquared);
+void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
+ const unsigned char *src, int src_stride,
+ unsigned int height, int *sum,
+ unsigned int *sumsquared);
+void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
+ const unsigned char *src, int src_stride,
+ unsigned int height, int *sum,
+ unsigned int *sumsquared);
+
+uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
+ int src_stride,
+ const unsigned char *dst,
+ int dst_stride, uint32_t *sse) {
+ int xsum0;
+ unsigned int xxsum0;
+
+ aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ assert(xsum0 <= 255 * 16 * 16);
+ assert(xsum0 >= -255 * 16 * 16);
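+  // variance = sse - sum^2 / (16 * 16)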
+ return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
+}
+
+uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
+ int src_stride,
+ const unsigned char *dst,
+ int dst_stride, uint32_t *sse) {
+ int xsum0;
+ unsigned int xxsum0;
+ aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
+ &xxsum0);
+
+ *sse = xxsum0;
+ assert(xsum0 <= 255 * 16 * 16);
+ assert(xsum0 >= -255 * 16 * 16);
+ return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
+}
+
+uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
+ int src_stride,
+ const unsigned char *dst,
+ int dst_stride, uint32_t *sse) {
+ int xsum0;
+ unsigned int xxsum0;
+
+ aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ assert(xsum0 <= 255 * 16 * 16);
+ assert(xsum0 >= -255 * 16 * 16);
+ return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 000000000..7d96e26ae
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+#include <string.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+
+#define CONV8_ROUNDING_BITS (7)
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
+
+typedef enum { PACK_8x1, PACK_8x2, PACK_16x1 } PixelPackFormat;
+
+typedef void (*WritePixels)(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch);
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
+void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int width, int h, int bd) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ assert(width % 4 == 0);
+ if (width > 32) { // width = 64
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ _mm256_storeu_si256((__m256i *)(dst + 32), p2);
+ _mm256_storeu_si256((__m256i *)(dst + 48), p3);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 16) { // width = 32
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 8) { // width = 16
+ __m256i p0, p1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+ p1 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (width > 4) { // width = 8
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storeu_si128((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storeu_si128((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // width = 4
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int width, int h, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ assert(width % 4 == 0);
+ if (width > 32) { // width = 64
+ __m256i p0, p1, p2, p3, u0, u1, u2, u3;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
+ u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
+ _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 16) { // width = 32
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 8) { // width = 16
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
+
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + dst_stride),
+ _mm256_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else if (width > 4) { // width = 8
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadu_si128((const __m128i *)dst);
+ u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
+
+ _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else { // width = 4
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadl_epi64((const __m128i *)dst);
+ u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
+
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
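+// The 8-tap horizontal path shuffles the input so that adjacent pixels form
+// 16-bit pairs matching the paired filter taps; each output row then needs
+// only four madd/add steps (see filter_8x1_pixels()).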
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
+
+// Note:
+// Shared by 8x2 and 16x1 block
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *x /*x[8]*/) {
+ __m256i pp[8];
+ pack_pixels(s0, pp);
+ pack_pixels(s1, &pp[4]);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_pixels_with_format(const uint16_t *src,
+ PixelPackFormat fmt,
+ ptrdiff_t stride, __m256i *x) {
+ switch (fmt) {
+ case PACK_8x1: {
+ __m256i pp[8];
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+ break;
+ }
+ case PACK_8x2: {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+ pack_16_pixels(&s0, &s1, x);
+ break;
+ }
+ case PACK_16x1: {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_pixels(&s0, &s1, x);
+ break;
+ }
+ default: { assert(0); }
+ }
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, const ptrdiff_t pitch,
+ __m256i *x /*x[4]*/) {
+ pack_pixels_with_format(src, PACK_8x1, pitch, x);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, const ptrdiff_t pitch,
+ __m256i *x /*x[8]*/) {
+ pack_pixels_with_format(src, PACK_8x2, pitch, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, const ptrdiff_t pitch,
+ __m256i *x /*x[8]*/) {
+ pack_pixels_with_format(src, PACK_16x1, pitch, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
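+// Loads the 8 filter taps and broadcasts each adjacent pair across the whole
+// register: f[0] holds taps {0,1}, f[1] {2,3}, f[2] {4,5}, f[3] {6,7}.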
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+ const __m256i *fil /*fil[4]*/,
+ __m256i *y) {
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
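+  // min(a0, a1) + max(a0, a1) == a0 + a1, so 'a' ends up accumulating all
+  // four tap products before rounding.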
+ const __m256i min = _mm256_min_epi32(a0, a1);
+ a = _mm256_add_epi32(a, min);
+
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+}
+
+static void write_8x1_pixels(const __m256i *y, const __m256i *z,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ (void)z;
+ (void)pitch;
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void write_8x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static void write_16x1_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t dst_pitch) {
+ (void)dst_pitch;
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void filter_block_width8_horiz(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1,
+ const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t dst_pitch,
+ uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch);
+ }
+}
+
+static void aom_highbd_filter_block1d8_h8_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_horiz(src, src_pitch, write_8x1_pixels, write_8x2_pixels,
+ dst, dst_pitch, height, filter, bd);
+}
+
+static void filter_block_width16_horiz(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch,
+ const WritePixels write_16x1,
+ uint16_t *dst_ptr, ptrdiff_t dst_pitch,
+ uint32_t height, const int16_t *filter,
+ int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d16_h8_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_horiz(src, src_pitch, write_16x1_pixels, dst, dst_pitch,
+ height, filter, bd);
+}
+
+// 2-tap horizontal filtering
+
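+// The bilinear (2-tap) kernels store their two nonzero coefficients in taps 3
+// and 4 of the 8-tap array, so only that pair is broadcast and applied.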
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p = _mm256_set1_epi32(0x09080706);
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(). The only
+// difference is what s0/s1 hold: either the first and second rows, or the
+// first 16 samples and the 16 samples starting 8 positions later.
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *sig) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+ const ptrdiff_t pitch, __m256i *sig) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static INLINE void filter_16x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void filter_block_width8_2t_horiz(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1,
+ const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t dst_pitch,
+ uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_8x2_2t_pixels(signal, &ff, &res0, &res1);
+ write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch);
+ }
+}
+
+static void aom_highbd_filter_block1d8_h2_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_2t_horiz(src, src_pitch, write_8x1_pixels,
+ write_8x2_pixels, dst, dst_pitch, height, filter,
+ bd);
+}
+
+static void filter_block_width16_2t_horiz(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch,
+ const WritePixels write_16x1,
+ uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16x1_2t_pixels(signal, &ff, &res0, &res1);
+ write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d16_h2_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_2t_horiz(src, src_pitch, write_16x1_pixels, dst,
+ dst_pitch, height, filter, bd);
+}
+
+// Vertical Filtering
+
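+// The 8-tap vertical path keeps a sliding window of nine rows: pack_8x9_init()
+// loads rows 0-6 once, and each pack_8x9_pixels() call appends rows 7 and 8 of
+// the current window, interleaved pairwise for madd.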
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;
+}
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);
+ filter_8x1_pixels(&sig[4], f, y1);
+}
+
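+// Slide the interleaved-row window down by two rows so that the next
+// iteration only has to load rows 7 and 8.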
+static INLINE void update_pixels(__m256i *sig) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
+
+static INLINE void write_8x1_pixels_ver(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ (void)pitch;
+ const __m128i v0 = _mm256_castsi256_si128(*y0);
+ const __m128i v1 = _mm256_castsi256_si128(*y1);
+ __m128i p = _mm_packus_epi32(v0, v1);
+ p = _mm_min_epi16(p, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, p);
+}
+
+static void filter_block_width8_vert(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, WritePixels write_8x1,
+ WritePixels write_8x2, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch);
+ }
+}
+
+static void aom_highbd_filter_block1d8_v8_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_vert(src, src_pitch, write_8x1_pixels_ver,
+ write_8x2_pixels, dst, dst_pitch, height, filter,
+ bd);
+}
+
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i u0, u1, u2, u3;
+ // load 0-6 rows
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+ }
+
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+}
+
+static INLINE void write_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static INLINE void write_16x1_pixels_ver(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ (void)y1;
+ (void)pitch;
+ const __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);
+ update_pixels(&sig[8]);
+}
+
+static void filter_block_width16_vert(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch,
+ WritePixels write_16x1,
+ WritePixels write_16x2, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ write_16x2(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch);
+ }
+}
+
+static void aom_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_vert(src, src_pitch, write_16x1_pixels_ver,
+ write_16x2_pixels, dst, dst_pitch, height, filter,
+ bd);
+}
+
+// 2-tap vertical filtering
+
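+// Only two adjacent rows are needed at a time: sig[2] caches the last row
+// that was loaded, so each iteration reads just one new row from memory.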
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the next row
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void filter_block_width16_2t_vert(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch,
+ WritePixels write_16x1,
+ uint16_t *dst_ptr, ptrdiff_t dst_pitch,
+ uint32_t height, const int16_t *filter,
+ int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_2t_vert(src, src_pitch, write_16x1_pixels, dst,
+ dst_pitch, height, filter, bd);
+}
+
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i p = _mm_set1_epi32(0x09080706);
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the next row
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static void write_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ res = _mm_min_epi16(res, *mask);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+typedef void (*Write8Pixels)(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst);
+
+static void filter_block_width8_2t_vert(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch,
+ Write8Pixels write_8x1,
+ uint16_t *dst_ptr, ptrdiff_t dst_pitch,
+ uint32_t height, const int16_t *filter,
+ int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ write_8x1(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_2t_vert(src, src_pitch, write_8x1_2t_pixels_ver, dst,
+ dst_pitch, height, filter, bd);
+}
+
+// Calculation with averaging the input pixels
+
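+// The *_avg writers load the pixels already present in dst and average them
+// with the new filter output (rounding up, as _mm_avg_epu16 does) before
+// storing.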
+static void write_8x1_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ (void)y1;
+ (void)pitch;
+ const __m128i a0 = _mm256_castsi256_si128(*y0);
+ const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void write_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
+ const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
+ const __m256i pix =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static void write_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ (void)pitch;
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static INLINE void write_8x1_avg_pixels_ver(const __m256i *y0,
+ const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ (void)pitch;
+ const __m128i v0 = _mm256_castsi256_si128(*y0);
+ const __m128i v1 = _mm256_castsi256_si128(*y1);
+ __m128i p = _mm_packus_epi32(v0, v1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ p = _mm_min_epi16(p, _mm256_castsi256_si128(*mask));
+ p = _mm_avg_epu16(p, pix);
+ _mm_storeu_si128((__m128i *)dst, p);
+}
+
+static INLINE void write_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
+ const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ p = _mm256_avg_epu16(p, pix0);
+ _mm256_storeu_si256((__m256i *)dst, p);
+
+ p = _mm256_min_epi16(*y1, *mask);
+ p = _mm256_avg_epu16(p, pix1);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static INLINE void write_16x1_avg_pixels_ver(const __m256i *y0,
+ const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ (void)y1;
+ (void)pitch;
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
+ p = _mm256_avg_epu16(p, pix);
+ _mm256_storeu_si256((__m256i *)dst, p);
+}
+
+static void write_8x1_2t_avg_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, *mask);
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void aom_highbd_filter_block1d8_h8_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_horiz(src, src_pitch, write_8x1_avg_pixels,
+ write_8x2_avg_pixels, dst, dst_pitch, height,
+ filter, bd);
+}
+
+static void aom_highbd_filter_block1d16_h8_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_horiz(src, src_pitch, write_16x1_avg_pixels, dst,
+ dst_pitch, height, filter, bd);
+}
+
+static void aom_highbd_filter_block1d8_v8_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_vert(src, src_pitch, write_8x1_avg_pixels_ver,
+ write_8x2_avg_pixels, dst, dst_pitch, height, filter,
+ bd);
+}
+
+static void aom_highbd_filter_block1d16_v8_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_vert(src, src_pitch, write_16x1_avg_pixels_ver,
+ write_16x2_avg_pixels, dst, dst_pitch, height,
+ filter, bd);
+}
+
+// 2-tap averaging
+
+static void aom_highbd_filter_block1d8_h2_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_2t_horiz(src, src_pitch, write_8x1_avg_pixels,
+ write_8x2_avg_pixels, dst, dst_pitch, height,
+ filter, bd);
+}
+
+static void aom_highbd_filter_block1d16_h2_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_2t_horiz(src, src_pitch, write_16x1_avg_pixels, dst,
+ dst_pitch, height, filter, bd);
+}
+
+static void aom_highbd_filter_block1d16_v2_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width16_2t_vert(src, src_pitch, write_16x1_avg_pixels, dst,
+ dst_pitch, height, filter, bd);
+}
+
+static void aom_highbd_filter_block1d8_v2_avg_avx2(
+ const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ filter_block_width8_2t_vert(src, src_pitch, write_8x1_2t_avg_pixels_ver, dst,
+ dst_pitch, height, filter, bd);
+}
+
+typedef void HbdFilter1dFunc(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t,
+ uint32_t, const int16_t *, int);
+
+#define HIGHBD_FUNC(width, dir, avg, opt) \
+ aom_highbd_filter_block1d##width##_##dir##_##avg##opt
+
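+// The 4-pixel-wide variants have no AVX2 implementation; the names below are
+// aliased to the existing SSE2 versions.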
+HbdFilter1dFunc HIGHBD_FUNC(4, h8, , sse2);
+HbdFilter1dFunc HIGHBD_FUNC(4, h2, , sse2);
+HbdFilter1dFunc HIGHBD_FUNC(4, v8, , sse2);
+HbdFilter1dFunc HIGHBD_FUNC(4, v2, , sse2);
+
+#define aom_highbd_filter_block1d4_h8_avx2 HIGHBD_FUNC(4, h8, , sse2)
+#define aom_highbd_filter_block1d4_h2_avx2 HIGHBD_FUNC(4, h2, , sse2)
+#define aom_highbd_filter_block1d4_v8_avx2 HIGHBD_FUNC(4, v8, , sse2)
+#define aom_highbd_filter_block1d4_v2_avx2 HIGHBD_FUNC(4, v2, , sse2)
+
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+HIGH_FUN_CONV_2D(, avx2);
+
+HbdFilter1dFunc HIGHBD_FUNC(4, h8, avg_, sse2);
+HbdFilter1dFunc HIGHBD_FUNC(4, h2, avg_, sse2);
+HbdFilter1dFunc HIGHBD_FUNC(4, v8, avg_, sse2);
+HbdFilter1dFunc HIGHBD_FUNC(4, v2, avg_, sse2);
+
+#define aom_highbd_filter_block1d4_h8_avg_avx2 HIGHBD_FUNC(4, h8, avg_, sse2)
+#define aom_highbd_filter_block1d4_h2_avg_avx2 HIGHBD_FUNC(4, h2, avg_, sse2)
+#define aom_highbd_filter_block1d4_v8_avg_avx2 HIGHBD_FUNC(4, v8, avg_, sse2)
+#define aom_highbd_filter_block1d4_v2_avg_avx2 HIGHBD_FUNC(4, v2, avg_, sse2)
+
+HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
+HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ avx2);
+HIGH_FUN_CONV_2D(avg_, avx2);
+
+#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm
new file mode 100644
index 000000000..5d84ef8a7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm
@@ -0,0 +1,456 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
+
+SECTION .text
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
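+  ; DC = (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3, written to every pixel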
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ paddw m0, m2
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, one
+ mov oned, 0x00010001
+ lea stride3q, [strideq*3]
+ movd m3, oned
+ pshufd m3, m3, 0x0
+ paddw m0, m2
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ paddw m0, [GLOBAL(pw_8)]
+ psrlw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m3, [aboveq+16]
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
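+  ; horizontally reduce to the 32-pixel total, widening to 32 bits since the
+  ; sum can exceed 16 bits at 12-bit depth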
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_16)]
+ psrad m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ paddw m0, m2
+ paddw m3, m4
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ mova m5, [leftq+32]
+ mova m6, [leftq+48]
+ paddw m2, m4
+ paddw m5, m6
+ paddw m0, m3
+ paddw m2, m5
+ pxor m1, m1
+ paddw m0, m2
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_32)]
+ psrad m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16 ], m0
+ mova [dstq +32 ], m0
+ mova [dstq +48 ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16 ], m0
+ mova [dstq+strideq*2+32 ], m0
+ mova [dstq+strideq*2+48 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4+16 ], m0
+ mova [dstq+strideq*4+32 ], m0
+ mova [dstq+strideq*4+48 ], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m0
+ mova [dstq+stride3q*2 +32], m0
+ mova [dstq+stride3q*2 +48], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
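+  ; V prediction: copy the row above into every row of the block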
+ movq m0, [aboveq]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m1
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ mova m2, [aboveq+32]
+ mova m3, [aboveq+48]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq +32], m2
+ mova [dstq +48], m3
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*2 +32], m2
+ mova [dstq+strideq*2 +48], m3
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+strideq*4 +32], m2
+ mova [dstq+strideq*4 +48], m3
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m1
+ mova [dstq+stride3q*2 +32], m2
+ mova [dstq+stride3q*2 +48], m3
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
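+  ; TM prediction: pred[r][c] = clamp(left[r] + above[c] - topleft, 0, (1 << bps) - 1)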
+ movd m1, [aboveq-2]
+ movq m0, [aboveq]
+ pshuflw m1, m1, 0x0
+ movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4
+ movlhps m1, m1 ; tl tl tl tl tl tl tl tl
+  ; Compute the min and max pixel values at this bit depth
+ pcmpeqw m3, m3
+ movd m4, bpsd
+ psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl
+ psllw m3, m4
+ pcmpeqw m2, m2
+ pxor m4, m4 ; min possible value
+ pxor m3, m2 ; max possible value
+ mova m1, [leftq]
+ pshuflw m2, m1, 0x0
+ pshuflw m5, m1, 0x55
+ movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2
+ paddw m2, m0
+ ;Clamp to the bit-depth
+ pminsw m2, m3
+ pmaxsw m2, m4
+ ;Store the values
+ movq [dstq ], m2
+ movhpd [dstq+strideq*2], m2
+ lea dstq, [dstq+strideq*4]
+ pshuflw m2, m1, 0xaa
+ pshuflw m5, m1, 0xff
+ movlhps m2, m5
+ paddw m2, m0
+ ;Clamp to the bit-depth
+ pminsw m2, m3
+ pmaxsw m2, m4
+ ;Store the values
+ movq [dstq ], m2
+ movhpd [dstq+strideq*2], m2
+ RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+ movd m1, [aboveq-2]
+ mova m0, [aboveq]
+ pshuflw m1, m1, 0x0
+  ; Compute the min and max pixel values at this bit depth
+ mov oned, 1
+ pxor m3, m3
+ pxor m4, m4
+ pinsrw m3, oned, 0
+ pinsrw m4, bpsd, 0
+ pshuflw m3, m3, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ punpcklqdq m3, m3
+ mov lineq, -4
+ mova m2, m3
+ punpcklqdq m1, m1
+ psllw m3, m4
+ add leftq, 16
+ psubw m3, m2 ; max possible value
+ pxor m4, m4 ; min possible value
+ psubw m0, m1
+.loop:
+ movd m1, [leftq+lineq*4]
+ movd m2, [leftq+lineq*4+2]
+ pshuflw m1, m1, 0x0
+ pshuflw m2, m2, 0x0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ paddw m1, m0
+ paddw m2, m0
+ ;Clamp to the bit-depth
+ pminsw m1, m3
+ pminsw m2, m3
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ ;Store the values
+ mova [dstq ], m1
+ mova [dstq+strideq*2], m2
+ lea dstq, [dstq+strideq*4]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
+ movd m2, [aboveq-2]
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ pshuflw m2, m2, 0x0
+  ; Compute the min and max pixel values at this bit depth
+ pcmpeqw m3, m3
+ movd m4, bpsd
+ punpcklqdq m2, m2
+ psllw m3, m4
+ pcmpeqw m5, m5
+ pxor m4, m4 ; min possible value
+ pxor m3, m5 ; max possible value
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -8
+ psubw m0, m2
+ psubw m1, m2
+.loop:
+ movd m7, [leftq]
+ pshuflw m5, m7, 0x0
+ pshuflw m2, m7, 0x55
+ punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1
+ punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2
+ paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1
+ paddw m5, m1 ; t5-tl+l1 to t8-tl+l1
+ pminsw m6, m3
+ pminsw m5, m3
+ pmaxsw m6, m4 ; Clamp to the bit-depth
+ pmaxsw m5, m4
+ mova [dstq ], m6
+ mova [dstq +16], m5
+ paddw m6, m2, m0
+ paddw m2, m1
+ pminsw m6, m3
+ pminsw m2, m3
+ pmaxsw m6, m4
+ pmaxsw m2, m4
+ mova [dstq+strideq*2 ], m6
+ mova [dstq+strideq*2+16], m2
+ lea dstq, [dstq+strideq*4]
+ inc lineq
+ lea leftq, [leftq+4]
+
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
+ movd m0, [aboveq-2]
+ mova m1, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ pshuflw m0, m0, 0x0
+  ; Compute the min and max pixel values at this bit depth
+ pcmpeqw m5, m5
+ movd m6, bpsd
+ psllw m5, m6
+ pcmpeqw m7, m7
+ pxor m6, m6 ; min possible value
+ pxor m5, m7 ; max possible value
+ punpcklqdq m0, m0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -16
+ psubw m1, m0
+ psubw m2, m0
+ psubw m3, m0
+ psubw m4, m0
+.loop:
+ movd m7, [leftq]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +48], m0
+ movd m7, [leftq+2]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2 ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+48], m0
+ lea dstq, [dstq+strideq*4]
+ lea leftq, [leftq+4]
+ inc lineq
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 000000000..76369871b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
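+// Clamp each 16-bit lane of |value| to the signed range for this bit depth:
+// [-(1 << (bd - 1)), (1 << (bd - 1)) - 1].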
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+ __m128i ubounded;
+ __m128i lbounded;
+ __m128i retval;
+
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i t80, max, min;
+
+ if (bd == 8) {
+ t80 = _mm_set1_epi16(0x80);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+ } else if (bd == 10) {
+ t80 = _mm_set1_epi16(0x200);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+ } else { // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+ }
+
+ min = _mm_subs_epi16(zero, t80);
+
+ ubounded = _mm_cmpgt_epi16(value, max);
+ lbounded = _mm_cmplt_epi16(value, min);
+ retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+ ubounded = _mm_and_si128(ubounded, max);
+ lbounded = _mm_and_si128(lbounded, min);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_or_si128(retval, lbounded);
+ return retval;
+}
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
+void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i blimit, limit, thresh;
+ __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+ __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+ __m128i ps1, qs1, ps0, qs0;
+ __m128i abs_p0q0, abs_p1q1, ffff, work;
+ __m128i filt, work_a, filter1, filter2;
+ __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+ __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+ __m128i flat2_q0, flat2_p0;
+ __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+ __m128i t4, t3, t80, t1;
+ __m128i eight, four;
+
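+  // Scale the 8-bit blimit/limit/thresh values to the current bit depth.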
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ }
+
+ q4 = _mm_load_si128((__m128i *)(s + 4 * p));
+ p4 = _mm_load_si128((__m128i *)(s - 5 * p));
+ q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+ p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+ q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+ p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+ q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+ p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+ q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+ p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+
+ // highbd_filter_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+ // highbd_hev_mask (in C code this is actually called from highbd_filter4)
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+
+ mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
+
+ // lp filter
+ // highbd_filter4
+ t4 = _mm_set1_epi16(4);
+ t3 = _mm_set1_epi16(3);
+ if (bd == 8)
+ t80 = _mm_set1_epi16(0x80);
+ else if (bd == 10)
+ t80 = _mm_set1_epi16(0x200);
+ else // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+
+ t1 = _mm_set1_epi16(0x1);
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+
+ filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
+ hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3, Filter2 >> 3
+ filter1 = _mm_srai_epi16(filter1, 0x3);
+ filter2 = _mm_srai_epi16(filter2, 0x3);
+
+ qs0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+ ps0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(hev, filt);
+ qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
+
+ // end highbd_filter4
+ // loopfilter done
+
+ // highbd_flat_mask4
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ flat = _mm_max_epi16(work, flat);
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ // end flat_mask4
+
+ // flat & mask = flat && mask (as used in filter8)
+  // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_load_si128((__m128i *)(s - 6 * p));
+ q5 = _mm_load_si128((__m128i *)(s + 5 * p));
+ p6 = _mm_load_si128((__m128i *)(s - 7 * p));
+ q6 = _mm_load_si128((__m128i *)(s + 6 * p));
+ p7 = _mm_load_si128((__m128i *)(s - 8 * p));
+ q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+
+ // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
+ // but referred to as p0-p4 & q0-q4 in fn)
+ flat2 = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
+ _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
+ _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
+ _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
+ _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ if (bd == 8)
+ flat2 = _mm_subs_epu16(flat2, one);
+ else if (bd == 10)
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
+ flat2 = _mm_cmpeq_epi16(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ // end highbd_flat_mask5
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
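+  // 'eight' and 'four' provide the rounding for the flat2 (>> 4) and flat
+  // (>> 3) filter averages computed below.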
+ eight = _mm_set1_epi16(8);
+ four = _mm_set1_epi16(4);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ flat2_p0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
+ flat2_q0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
+ flat_p0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
+ flat_q0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);
+
+ sum_p7 = _mm_add_epi16(p7, p7);
+ sum_q7 = _mm_add_epi16(q7, q7);
+ sum_p3 = _mm_add_epi16(p3, p3);
+ sum_q3 = _mm_add_epi16(q3, q3);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+ flat2_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+ flat2_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+ flat_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
+ flat_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ sum_p3 = _mm_add_epi16(sum_p3, p3);
+ sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+ flat2_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
+ flat2_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+ flat_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
+ flat_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+ flat2_p3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
+ flat2_q3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+ flat2_p4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
+ flat2_q4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+ flat2_p5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
+ flat2_q5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+ flat2_p6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
+ flat2_q6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ // highbd_filter8
+ p2 = _mm_andnot_si128(flat, p2);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ // when (flat && mask)
+ p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values
+
+ ps1 = _mm_andnot_si128(flat, ps1);
+  // p1 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ // when (flat && mask)
+ p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values
+ qs1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values
+
+ ps0 = _mm_andnot_si128(flat, ps0);
+  // p0 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ // when (flat && mask)
+ p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values
+ qs0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values
+ // end highbd_filter8
+
+ // highbd_filter16
+ p6 = _mm_andnot_si128(flat2, p6);
+ // p6 remains unchanged if !(flat2 && flat && mask)
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ // get values for when (flat2 && flat && mask)
+ p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values
+ q6 = _mm_andnot_si128(flat2, q6);
+ // q6 remains unchanged if !(flat2 && flat && mask)
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ // get values for when (flat2 && flat && mask)
+ q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
+ _mm_store_si128((__m128i *)(s - 7 * p), p6);
+ _mm_store_si128((__m128i *)(s + 6 * p), q6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ // p5 remains unchanged if !(flat2 && flat && mask)
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ // get values for when (flat2 && flat && mask)
+ p5 = _mm_or_si128(p5, flat2_p5);
+ // full list of p5 values
+ q5 = _mm_andnot_si128(flat2, q5);
+ // q5 remains unchanged if !(flat2 && flat && mask)
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ // get values for when (flat2 && flat && mask)
+ q5 = _mm_or_si128(q5, flat2_q5);
+ // full list of q5 values
+ _mm_store_si128((__m128i *)(s - 6 * p), p5);
+ _mm_store_si128((__m128i *)(s + 5 * p), q5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ // p4 remains unchanged if !(flat2 && flat && mask)
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ // get values for when (flat2 && flat && mask)
+ p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values
+ q4 = _mm_andnot_si128(flat2, q4);
+ // q4 remains unchanged if !(flat2 && flat && mask)
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ // get values for when (flat2 && flat && mask)
+ q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
+ _mm_store_si128((__m128i *)(s - 5 * p), p4);
+ _mm_store_si128((__m128i *)(s + 4 * p), q4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ // p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ // get values for when (flat2 && flat && mask)
+ p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values
+ q3 = _mm_andnot_si128(flat2, q3);
+ // q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ // get values for when (flat2 && flat && mask)
+ q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
+ _mm_store_si128((__m128i *)(s - 4 * p), p3);
+ _mm_store_si128((__m128i *)(s + 3 * p), q3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ // get values for when (flat2 && flat && mask)
+ p2 = _mm_or_si128(p2, flat2_p2);
+ // full list of p2 values
+ q2 = _mm_andnot_si128(flat2, q2);
+ // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ // get values for when (flat2 && flat && mask)
+ q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
+ _mm_store_si128((__m128i *)(s - 3 * p), p2);
+ _mm_store_si128((__m128i *)(s + 2 * p), q2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ // get values for when (flat2 && flat && mask)
+ p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values
+ q1 = _mm_andnot_si128(flat2, q1);
+ // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ // get values for when (flat2 && flat && mask)
+ q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
+ _mm_store_si128((__m128i *)(s - 2 * p), p1);
+ _mm_store_si128((__m128i *)(s + 1 * p), q1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ // get values for when (flat2 && flat && mask)
+ p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values
+ q0 = _mm_andnot_si128(flat2, q0);
+ // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ // get values for when (flat2 && flat && mask)
+ q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
+ _mm_store_si128((__m128i *)(s - 1 * p), p0);
+ _mm_store_si128((__m128i *)(s - 0 * p), q0);
+}
+
+void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd);
+ aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+ const __m128i zero = _mm_set1_epi16(0);
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+ __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+ __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+ __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+ __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+ __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+ __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_shft;
+
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ __m128i t80;
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i ps1, ps0, qs0, qs1;
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ t80 = _mm_set1_epi16(0x200);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ t80 = _mm_set1_epi16(0x800);
+ }
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+
+ // filter_mask and hev_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ mask = _mm_max_epi16(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ mask = _mm_max_epi16(abs_q1q0, mask);
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // flat_mask4
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(abs_q1q0, flat);
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+  // 'four' is added below so the >> 3 shifts round as in ROUND_POWER_OF_TWO
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+
+ // lp filter
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = signed_char_clamp_bd_sse2(filt, bd);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi16(filt, t4);
+ filter2 = _mm_adds_epi16(filt, t3);
+
+ // Filter1 >> 3
+ filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+ filter1 = _mm_srai_epi16(filter1, 3);
+
+ // Filter2 >> 3
+ filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+ filter2 = _mm_srai_epi16(filter2, 3);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_store_si128((__m128i *)(s - 3 * p), p2);
+ _mm_store_si128((__m128i *)(s - 2 * p), p1);
+ _mm_store_si128((__m128i *)(s - 1 * p), p0);
+ _mm_store_si128((__m128i *)(s + 0 * p), q0);
+ _mm_store_si128((__m128i *)(s + 1 * p), q1);
+ _mm_store_si128((__m128i *)(s + 2 * p), q2);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
+ aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ const __m128i zero = _mm_set1_epi16(0);
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+ const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+ __m128i work;
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ __m128i t80;
+ __m128i tff80;
+ __m128i tffe0;
+ __m128i t1f;
+ // equivalent to shifting 0x1f left by bitdepth - 8
+ // and setting new bits to 1
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i t7f;
+ // equivalent to shifting 0x7f left by bitdepth - 8
+ // and setting new bits to 1
+ __m128i ps1, ps0, qs0, qs1;
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ tff80 = _mm_set1_epi16(0xff80);
+ tffe0 = _mm_set1_epi16(0xffe0);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+ }
+
+ ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ mask = _mm_max_epi16(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // filter4
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
+ filter1 = _mm_and_si128(filter1, t1f); // clamp the range
+ filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, tffe0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ work_a = _mm_cmpgt_epi16(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, tff80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+ q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ p0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+ p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+void aom_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
+ aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
+ int out_p, int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ uint16_t *in = src[idx8x8];
+ uint16_t *out = dst[idx8x8];
+
+ p0 =
+ _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ p1 =
+ _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ p2 =
+ _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ p3 =
+ _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ p4 =
+ _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ p5 =
+ _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ p6 =
+ _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ p7 =
+ _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 00 10 01 11 02 12 03 13
+ x0 = _mm_unpacklo_epi16(p0, p1);
+ // 20 30 21 31 22 32 23 33
+ x1 = _mm_unpacklo_epi16(p2, p3);
+ // 40 50 41 51 42 52 43 53
+ x2 = _mm_unpacklo_epi16(p4, p5);
+ // 60 70 61 71 62 72 63 73
+ x3 = _mm_unpacklo_epi16(p6, p7);
+ // 00 10 20 30 01 11 21 31
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 40 50 60 70 41 51 61 71
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 00 10 20 30 40 50 60 70
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 01 11 21 31 41 51 61 71
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
+ // 00 10 20 30 40 50 60 70
+ _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
+ // 01 11 21 31 41 51 61 71
+
+ // 02 12 22 32 03 13 23 33
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 42 52 62 72 43 53 63 73
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 02 12 22 32 42 52 62 72
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
+ // 02 12 22 32 42 52 62 72
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
+ // 03 13 23 33 43 53 63 73
+
+ // 04 14 05 15 06 16 07 17
+ x0 = _mm_unpackhi_epi16(p0, p1);
+ // 24 34 25 35 26 36 27 37
+ x1 = _mm_unpackhi_epi16(p2, p3);
+ // 44 54 45 55 46 56 47 57
+ x2 = _mm_unpackhi_epi16(p4, p5);
+ // 64 74 65 75 66 76 67 77
+ x3 = _mm_unpackhi_epi16(p6, p7);
+ // 04 14 24 34 05 15 25 35
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 44 54 64 74 45 55 65 75
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 04 14 24 34 44 54 64 74
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 05 15 25 35 45 55 65 75
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
+ // 04 14 24 34 44 54 64 74
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
+ // 05 15 25 35 45 55 65 75
+
+ // 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 06 16 26 36 46 56 66 76
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
+ // 06 16 26 36 46 56 66 76
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
+ // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
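+// Transpose two vertically adjacent 8x8 blocks (in0 above in1) into a single
+// 8x16 block at |out|; the transpose of in1 lands at out + 8.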
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
+ uint16_t *out, int out_p) {
+ uint16_t *src0[1];
+ uint16_t *src1[1];
+ uint16_t *dest0[1];
+ uint16_t *dest1[1];
+ src0[0] = in0;
+ src1[0] = in1;
+ dest0[0] = out;
+ dest1[0] = out + 8;
+ highbd_transpose(src0, in_p, dest0, out_p, 1);
+ highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, p, dst, 8, 1);
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, p, 1);
+}
+
+void aom_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, p, 2);
+}
+
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, p, dst, 8, 1);
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, p, 1);
+}
+
+void aom_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, p, 2);
+}
+
+void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ src[0] = s - 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
+
+ // Transpose 16x8
+ highbd_transpose(src, p, dst, 8, 2);
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
+ bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8 * 8;
+ dst[0] = s - 8;
+ dst[1] = s;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, p, 2);
+}
+
+void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
+
+ // Transpose 16x16
+ highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+ highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
+ thresh, bd);
+
+ // Transpose back
+ highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+ highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 000000000..3ee24ab16
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ }
+ *eob_ptr = eob_i + 1;
+}
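+
+// Scalar reference for the SIMD loop above (illustrative sketch, not part of
+// the upstream sources).  For a coefficient c at scan position k, the lanes
+// that pass the zero-bin test compute:
+//
+//   if (abs(c) >= zbin[k != 0]) {
+//     int64_t tmp = abs(c) + round[k != 0];
+//     tmp = ((tmp * quant[k != 0]) >> 16) + tmp;
+//     const uint32_t abs_q = (uint32_t)((tmp * quant_shift[k != 0]) >> 16);
+//     qcoeff[k]  = c < 0 ? -(int)abs_q : (int)abs_q;  // (abs_q ^ sign) - sign
+//     dqcoeff[k] = qcoeff[k] * dequant[k != 0];
+//   }  // otherwise both outputs keep the zero written by the memsets above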
+
+void aom_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
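+
+// Descriptive note (not from upstream): relative to the function above, the
+// 32x32 variant halves zbin and round with ROUND_POWER_OF_TWO(x, 1), uses a
+// final shift of 15 instead of 16, and halves the dequantized value, which
+// compensates for the extra scaling applied to 32x32 transform coefficients.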
+#endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 000000000..0c7cb3998
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,290 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
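+
+; Descriptive note (not from upstream): because the pixels are unsigned
+; 16-bit values, the absolute difference is built without an abs instruction
+; as |a - b| = (a -usat b) | (b -usat a); one of the two saturating psubusw
+; results is always zero.  The pmaddwd against m1 (eight words of 1) then
+; adds each pair of differences into a 32-bit lane, so the SADs accumulate
+; in dword lanes of m4..m7 rather than in 16-bit words.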
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16,
+;             8x8, 8x4, 4x8 or 4x4
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+
+; set m1
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
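+; m1 now holds eight words of 1: it is the pmaddwd multiplier that widens
+; pairs of 16-bit absolute differences into 32-bit partial sums.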
+
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits), so in high bit depth even
+  ; the smallest width (4) needs 128 bits, i.e. an XMM register
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
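+
+; Descriptive note (not from upstream): after the HIGH_PROCESS loop, m4..m7
+; each hold the partial SAD of one reference spread over four dword lanes.
+; The movhlps/paddd folds, the two punpckldq interleaves and the final
+; punpcklqdq reduce them to a single register containing
+; { sad(ref1), sad(ref2), sad(ref3), sad(ref4) }, which is stored to res.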
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 000000000..8427b891c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,366 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
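+
+; Descriptive note (not from upstream): the high-bit-depth SAD entry points
+; receive CONVERT_TO_BYTEPTR()-encoded pointers, i.e. the real uint16_t
+; address shifted right by one, so the prologue above shifts src, ref and
+; second_pred left by one to recover element addresses.  Strides stay in
+; sample units and are multiplied by two wherever a row is advanced.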
+
+; unsigned int aom_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+
+
+; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+ mov n_rowsd, %1/2
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+
+
+; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m2
+ psubusw m2, [srcq+src_strideq*2]
+ por m2, m5
+ mova m5, [srcq+src_strideq*4]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*4]
+ por m3, m5
+ mova m5, [srcq+src_stride3q*2]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_stride3q*2]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 000000000..797e9c1d4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1040 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
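+
+; Table layout (descriptive comment, not from upstream): one 32-byte entry
+; per eighth-pel offset k = 0..7, holding the two bilinear taps as a pair of
+; 8-word rows, (16 - 2k) then (2k).  The taps sum to 16, so filtered values
+; are rounded with pw_8 and shifted right by 4; filter_idx_shift is 5 because
+; each entry spans 2 x 16 bytes.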
+
+SECTION .text
+
+; int aom_highbd_sub_pixel_varianceNxh(const uint16_t *src,
+;                                      ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *dst,
+;                                      ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) as its return value and stores
+; the sum of squared errors (SSE) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
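+
+; C equivalent of SUM_SSE (illustrative sketch, not part of the upstream
+; sources).  For each 16-bit lane of the two src/dst pairs:
+;
+;   d    = src - dst;        // signed 16-bit difference
+;   sse += (int32_t)d * d;   // pmaddwd of the difference with itself
+;   sum += (int32_t)d;       // diffs are summed four at a time in words,
+;                            // sign-extended, then accumulated as dwords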
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd rax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+ add srcq, src_stridemp
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
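+
+; Descriptive note (not from upstream): srcq addresses uint16_t samples while
+; src_stride is kept in sample units, so one row is src_stride*2 bytes.  The
+; x86-32 PIC path cannot scale the in-memory stride inside an lea, so it
+; simply adds it twice.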
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, \
+ sse, g_bilin_filter, g_pw_8
+ %define block_height heightd
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define block_height heightd
+ %define sec_str sec_strideq
+ %else
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar block_height, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
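+  ; For reference (not from upstream), the two forms are algebraically equal:
+  ;   ((16-x)*in1 + x*in2 + 8) >> 4
+  ;     = (16*in1 + x*(in2-in1) + 8) >> 4
+  ;     = in1 + ((x*(in2-in1) + 8) >> 4)
+  ; provided x*(in2-in1) is computed in signed arithmetic without overflow;
+  ; the current code keeps everything unsigned, which is why it performs two
+  ; multiplies.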
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the filters - this is the same as in the 8-bit depth case
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so the src_stride register is
+; reused; src_stride has to be reloaded from the stack whenever it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 000000000..7bc8a0df3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred,
+ ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+}
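+
+// Descriptive note (not from upstream): the 4-wide kernels load a full
+// 128-bit row (eight uint16_t) even though only four samples per row are
+// used; only the low 64 bits of each difference are written back with
+// _mm_storel_epi64, so the upper lanes are computed and discarded.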
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+ store_diff = (int64_t *)(diff + 4 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x4);
+ store_diff = (int64_t *)(diff + 5 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x5);
+ store_diff = (int64_t *)(diff + 6 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x6);
+ store_diff = (int64_t *)(diff + 7 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+ _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
+ _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
+ _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
+ _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
+}
+
+static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 3;
+ src += src_stride << 3;
+ pred += pred_stride << 3;
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 8;
+ src += 8;
+ pred += 8;
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 3;
+ src += src_stride << 3;
+ pred += pred_stride << 3;
+ subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 4;
+ src += src_stride << 4;
+ pred += pred_stride << 4;
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 16;
+ src += 16;
+ pred += 16;
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 4;
+ src += src_stride << 4;
+ pred += pred_stride << 4;
+ subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 5;
+ src += src_stride << 5;
+ pred += pred_stride << 5;
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 32;
+ src += 32;
+ pred += 32;
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 5;
+ src += src_stride << 5;
+ pred += pred_stride << 5;
+ subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 6;
+ src += src_stride << 6;
+ pred += pred_stride << 6;
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 64;
+ src += 64;
+ pred += 64;
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 6;
+ src += src_stride << 6;
+ pred += pred_stride << 6;
+ subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
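+// Returns the subtract kernel for a rows x cols block. The kernels are named
+// <width>x<height> (i.e. cols x rows); each larger kernel halves the block
+// and calls the next smaller one twice, so every size bottoms out in the
+// small fixed-size SSE2 routines above.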
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+ SubtractWxHFuncType ret_func_ptr = NULL;
+ if (rows == 4) {
+ if (cols == 4) {
+ ret_func_ptr = subtract_4x4;
+ } else if (cols == 8) {
+ ret_func_ptr = subtract_8x4;
+ }
+ } else if (rows == 8) {
+ if (cols == 4) {
+ ret_func_ptr = subtract_4x8;
+ } else if (cols == 8) {
+ ret_func_ptr = subtract_8x8;
+ } else if (cols == 16) {
+ ret_func_ptr = subtract_16x8;
+ }
+ } else if (rows == 16) {
+ if (cols == 8) {
+ ret_func_ptr = subtract_8x16;
+ } else if (cols == 16) {
+ ret_func_ptr = subtract_16x16;
+ } else if (cols == 32) {
+ ret_func_ptr = subtract_32x16;
+ }
+ } else if (rows == 32) {
+ if (cols == 16) {
+ ret_func_ptr = subtract_16x32;
+ } else if (cols == 32) {
+ ret_func_ptr = subtract_32x32;
+ } else if (cols == 64) {
+ ret_func_ptr = subtract_64x32;
+ }
+ } else if (rows == 64) {
+ if (cols == 32) {
+ ret_func_ptr = subtract_32x64;
+ } else if (cols == 64) {
+ ret_func_ptr = subtract_64x64;
+ } else if (cols == 128) {
+ ret_func_ptr = subtract_128x64;
+ }
+ } else if (rows == 128) {
+ if (cols == 64) {
+ ret_func_ptr = subtract_64x128;
+ } else if (cols == 128) {
+ ret_func_ptr = subtract_128x128;
+ }
+ }
+ if (!ret_func_ptr) {
+ assert(0);
+ }
+ return ret_func_ptr;
+}
+
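+// SSE2 entry point for high bit-depth residual computation:
+// diff = src - pred for a rows x cols block of 16-bit samples. The bit depth
+// argument is unused because the subtraction is identical for 8-, 10- and
+// 12-bit input.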
+void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ SubtractWxHFuncType func;
+ (void)bd;
+
+ func = getSubtractFunc(rows, cols);
+ func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 000000000..cf8ea498c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,316 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;unsigned int aom_highbd_calc16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
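+;
+; Writes sum(src - ref) to *Sum and sum((src - ref)^2) to *SSE over a 16x16
+; block of 16-bit (high bit-depth) samples; the strides are given in samples
+; and are doubled to byte strides on entry.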
+global sym(aom_highbd_calc16x16var_sse2) PRIVATE
+sym(aom_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int aom_highbd_calc8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
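+;
+; Same as aom_highbd_calc16x16var_sse2 but for an 8x8 block; each unaligned
+; 16-byte load covers one full row of eight 16-bit samples.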
+global sym(aom_highbd_calc8x8var_sse2) PRIVATE
+sym(aom_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 000000000..29f96ce24
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
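+// Accumulates sum and SSE over a w x h area by tiling the block-sized
+// assembly kernel var_fn. The 10- and 12-bit variants below rescale the
+// totals to the 8-bit range before returning them.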
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
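+// HIGH_GET_VAR(S) defines the aom_highbd{,_10,_12}_get<S>x<S>var_sse2()
+// wrappers around the assembly kernels; the 10- and 12-bit versions scale
+// sum down by the extra bit depth and SSE by twice that amount.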
+#define HIGH_GET_VAR(S) \
+ void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ } \
+ \
+ void aom_highbd_10_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ } \
+ \
+ void aom_highbd_12_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ }
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
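+// VAR_FN(w, h, block_size, shift) defines aom_highbd_{8,10,12}_variance
+// functions for a w x h block: variance = SSE - sum^2 / (w * h), with the
+// division done as a right shift by 'shift' = log2(w) + log2(h).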
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These declarations correspond to functions defined in
+// highbd_subpel_variance_impl_sse2.asm.
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+
+#undef DECLS
+#undef DECL
+
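+// FN() builds the full-block sub-pixel variance functions on top of the
+// wf-wide assembly kernel: blocks wider than wf re-run the kernel at column
+// offsets 16, 32 and 48. The 12-bit variant also limits each call to at most
+// 16 rows and accumulates SSE in 64 bits to avoid overflow.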
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ int64_t var; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
+#define DECLS(opt1) \
+ DECL(16, opt1) \
+ DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
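+// Same tiling scheme as the plain sub-pixel variance wrappers above, with the
+// second (averaging) predictor 'sec' advanced by the same column and row
+// offsets as src and dst.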
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ int64_t var; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt1) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+  FN(8, 16, 8, 3, 4, opt1, (int64_t));  \
+ FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+
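+// Gathers every 8th 16-bit sample of every 8th row of 'ref' into comp_pred,
+// i.e. the integer-position samples of a reference that is assumed to be
+// stored upsampled by 8 in both directions.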
+void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
+ const uint8_t *ref8, int ref_stride) {
+ int i, j;
+ int stride = ref_stride << 3;
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ if (width >= 8) {
+    // process 8 output samples at a time
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+ __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+ __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+ __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+ __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+ __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+ __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+ __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+ __m128i t0, t1, t2, t3;
+
+ t0 = _mm_unpacklo_epi16(s0, s1);
+ t1 = _mm_unpacklo_epi16(s2, s3);
+ t2 = _mm_unpacklo_epi16(s4, s5);
+ t3 = _mm_unpacklo_epi16(s6, s7);
+ t0 = _mm_unpacklo_epi32(t0, t1);
+ t2 = _mm_unpacklo_epi32(t2, t3);
+ t0 = _mm_unpacklo_epi64(t0, t2);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), t0);
+ comp_pred += 8;
+ ref += 64; // 8 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ } else {
+    // process 4 output samples at a time
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+ __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+ __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+ __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+ __m128i t0, t1;
+
+ t0 = _mm_unpacklo_epi16(s0, s1);
+ t1 = _mm_unpacklo_epi16(s2, s3);
+ t0 = _mm_unpacklo_epi32(t0, t1);
+
+ _mm_storel_epi64((__m128i *)(comp_pred), t0);
+ comp_pred += 4;
+ ref += 4 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ }
+}
+
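+// As above, but each gathered reference sample is averaged with the
+// corresponding 'pred' sample using a saturating add and round-to-nearest:
+// (r + p + 1) >> 1.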
+void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
+ const uint8_t *pred8, int width,
+ int height, const uint8_t *ref8,
+ int ref_stride) {
+ const __m128i one = _mm_set1_epi16(1);
+ int i, j;
+ int stride = ref_stride << 3;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ if (width >= 8) {
+    // process 8 output samples at a time
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+ __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+ __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+ __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+ __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+ __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+ __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+ __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i t0, t1, t2, t3;
+
+ t0 = _mm_unpacklo_epi16(s0, s1);
+ t1 = _mm_unpacklo_epi16(s2, s3);
+ t2 = _mm_unpacklo_epi16(s4, s5);
+ t3 = _mm_unpacklo_epi16(s6, s7);
+ t0 = _mm_unpacklo_epi32(t0, t1);
+ t2 = _mm_unpacklo_epi32(t2, t3);
+ t0 = _mm_unpacklo_epi64(t0, t2);
+
+ p0 = _mm_adds_epu16(t0, p0);
+ p0 = _mm_adds_epu16(p0, one);
+ p0 = _mm_srli_epi16(p0, 1);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), p0);
+ comp_pred += 8;
+ pred += 8;
+ ref += 8 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ } else {
+    // process 4 output samples at a time
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+ __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+ __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+ __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+ __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+ __m128i t0, t1;
+
+ t0 = _mm_unpacklo_epi16(s0, s1);
+ t1 = _mm_unpacklo_epi16(s2, s3);
+ t0 = _mm_unpacklo_epi32(t0, t1);
+
+ p0 = _mm_adds_epu16(t0, p0);
+ p0 = _mm_adds_epu16(p0, one);
+ p0 = _mm_srli_epi16(p0, 1);
+
+ _mm_storel_epi64((__m128i *)(comp_pred), p0);
+ comp_pred += 4;
+ pred += 4;
+ ref += 4 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 000000000..cc7f52811
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
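+// One-pass sum and SSE of a 4x4 high bit-depth difference block: the four
+// 4-sample rows are packed into two registers, _mm_madd_epi16 against a
+// vector of ones gives the sums and against the difference itself the
+// squared sums, and _mm_hadd_epi32 reduces each to a single 32-bit lane.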
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ uint64_t *sse, int64_t *sum) {
+ __m128i u0, u1, u2, u3;
+ __m128i s0, s1, s2, s3;
+ __m128i t0, t1, x0, y0;
+ __m128i a0, a1, a2, a3;
+ __m128i b0, b1, b2, b3;
+ __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+ a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+ a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+ a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+ b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+ b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+ b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+ b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+ u0 = _mm_unpacklo_epi16(a0, a1);
+ u1 = _mm_unpacklo_epi16(a2, a3);
+ u2 = _mm_unpacklo_epi16(b0, b1);
+ u3 = _mm_unpacklo_epi16(b2, b3);
+
+ s0 = _mm_sub_epi16(u0, u2);
+ s1 = _mm_sub_epi16(u1, u3);
+
+ t0 = _mm_madd_epi16(s0, k_one_epi16);
+ t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ y0 = _mm_hadd_epi32(s3, s3);
+
+ t0 = _mm_madd_epi16(s0, s0);
+ t1 = _mm_madd_epi16(s1, s1);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ x0 = _mm_hadd_epi32(s3, s3);
+
+ *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+ *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)local_sse;
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return diff >= 0 ? (uint32_t)diff : 0;
+}
+
+// Sub-pixel
+uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+// Sub-pixel average
+
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
+ 4);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
+ 4);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
+ 4);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm
new file mode 100644
index 000000000..02567db49
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm
@@ -0,0 +1,771 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pb_1: times 16 db 1
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
+
+SECTION .text
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from Pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
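+; e.g. x=1, y=2, z=4: (1+2*2+4+2)>>2 = 2, and the sequence above gives
+; avg(1,4) = 3, 3 - ((1^4)&1) = 2, avg(2,2) = 2. Subtracting xor(x,z)&1
+; turns the rounding average of x and z into a truncating one, so only the
+; final average contributes the rounding term.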
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
+
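+; dc_predictor_NxN fills the block with the rounded average of the N 'above'
+; and N 'left' samples; the _top/_left variants average only one edge, and
+; the _128 variants store the constant 128.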
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movd m2, [leftq]
+ movd m0, [aboveq]
+ pxor m1, m1
+ punpckldq m0, m2
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_8)]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_16)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_32)]
+ psraw m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
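+; v_predictor_NxN copies the 'above' row into every output row;
+; h_predictor_NxN (further below) replicates left[row] across each row.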
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movd m0, [aboveq]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+ movifnidn leftq, leftmp
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0
+ pshufd m1, m0, 0x1
+ movd [dstq ], m0
+ movd [dstq+strideq], m1
+ pshufd m2, m0, 0x2
+ lea dstq, [dstq+strideq*2]
+ pshufd m3, m0, 0x3
+ movd [dstq ], m2
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -2
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [leftq ]
+ punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop:
+ pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+ pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ pshuflw m1, m0, 0xaa
+ pshuflw m2, m0, 0xff
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+ inc lineq
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -4
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+strideq ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -8
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16 ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2 ], m1
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
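+; tm_predictor_NxN (TrueMotion): each output pixel is the clamped value of
+; left[row] + above[col] - topleft. (above - topleft) is computed once per
+; block and added to the replicated left sample; packuswb provides the clamp.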
+INIT_XMM sse2
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
+ pxor m1, m1
+ movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
+ punpcklbw m0, m1
+ pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word]
+ psrldq m0, 2
+ psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+ movd m2, [leftq]
+ punpcklbw m2, m1
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m4
+ packuswb m3, m3
+ movd [dstq ], m4
+ movd [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ pshuflw m4, m2, 0xaa
+ pshuflw m3, m2, 0xff
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m4
+ packuswb m3, m3
+ movd [dstq ], m4
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ movq m0, [aboveq]
+ punpcklbw m2, m1
+ punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+ pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -4
+ punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word]
+ psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word]
+ movq m2, [leftq]
+ punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop:
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+ punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m3
+ movq [dstq ], m4
+ movhps [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m2, 4
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
+ pxor m1, m1
+ mova m2, [aboveq-16];
+ mova m0, [aboveq] ; t1 t2 ... t16 [byte]
+ punpckhbw m2, m1 ; [127:112] tl [word]
+ punpckhbw m4, m0, m1
+ punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word]
+ DEFINE_ARGS dst, stride, line, left, stride8
+ mov lineq, -8
+ pshufhw m2, m2, 0xff
+ mova m3, [leftq] ; l1 l2 ... l16 [byte]
+ punpckhqdq m2, m2 ; tl repeated 8 times [word]
+ psubw m0, m2
+ psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word]
+ punpckhbw m5, m3, m1
+ punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word]
+ lea stride8q, [strideq*8]
+.loop:
+ pshuflw m6, m3, 0x0
+ pshuflw m7, m5, 0x0
+ punpcklqdq m6, m6 ; l1 repeated 8 times [word]
+  punpcklqdq m7, m7 ; l9 repeated 8 times [word]
+ paddw m1, m6, m0
+  paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,16] [word]
+ psrldq m5, 2
+ packuswb m1, m6
+ mova [dstq ], m1
+ paddw m1, m7, m0
+  paddw m7, m4 ; m1:m7 ti-tl+l9 [i=1,16] [word]
+ psrldq m3, 2
+ packuswb m1, m7
+ mova [dstq+stride8q], m1
+ inc lineq
+ lea dstq, [dstq+strideq]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ mova m0, [aboveq]
+ mova m4, [aboveq+16]
+ punpcklbw m2, m1
+ punpckhbw m3, m0, m1
+ punpckhbw m5, m4, m1
+ punpcklbw m0, m1
+ punpcklbw m4, m1
+ pshuflw m2, m2, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -16
+ punpcklqdq m2, m2
+ add leftq, 32
+ psubw m0, m2
+ psubw m3, m2
+ psubw m4, m2
+ psubw m5, m2
+.loop:
+ movd m2, [leftq+lineq*2]
+ pxor m1, m1
+ punpcklbw m2, m1
+ pshuflw m7, m2, 0x55
+ pshuflw m2, m2, 0x0
+ punpcklqdq m2, m2
+ punpcklqdq m7, m7
+ paddw m6, m2, m3
+ paddw m1, m2, m0
+ packuswb m1, m6
+ mova [dstq ], m1
+ paddw m6, m2, m5
+ paddw m1, m2, m4
+ packuswb m1, m6
+ mova [dstq+16 ], m1
+ paddw m6, m7, m3
+ paddw m1, m7, m0
+ packuswb m1, m6
+ mova [dstq+strideq ], m1
+ paddw m6, m7, m5
+ paddw m1, m7, m4
+ packuswb m1, m6
+ mova [dstq+strideq+16], m1
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm
new file mode 100644
index 000000000..bc1bb2ff3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm
@@ -0,0 +1,410 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+SECTION .text
+
+; ------------------------------------------
+; input: x, y, z; output: result
+;
+; trick from Pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
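+; e.g. x = 5, y = 1, z = 8:
+;   (5 + 2*1 + 8 + 2) >> 2 = 4
+;   avg(5,8) = 7, 7 - ((5^8)&1) = 6, avg(6,1) = 4
+; ------------------------------------------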
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
+
+INIT_XMM ssse3
+cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ pshufb m1, m3, [GLOBAL(sh_b23456777)]
+ pshufb m2, m3, [GLOBAL(sh_b12345677)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m3, 1
+ psrldq m4, 1
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movd m0, [leftq] ; l1, l2, l3, l4
+ movd m1, [aboveq-1] ; tl, t1, t2, t3
+ punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
+ pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+ psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
+ psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1
+ ; A2 B2 A1 B1
+ ; A3 B3 A2 B2
+ ; A4 B4 A3 B3
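+  ; A* entries are 2-tap averages and B*, C1, D1 are 3-tap averages of the
+  ; edge pixels; each later row prepends two new values and reuses the row
+  ; above it.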
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
+ pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
+
+ punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+stride3q ], m3
+ psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq*2], m3
+ psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq ], m3
+ psrldq m3, 2 ; A1 B1 C1 D1 ..
+ movd [dstq ], m3
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movq m0, [leftq] ; [0- 7] l1-8 [byte]
+ movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
+  pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [byte]
+  pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [byte]
+  pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [byte]
+  pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [byte]
+  psrldq m4, m0, 1 ; t1-7 [byte]
+  psrldq m5, m0, 2 ; t2-7 [byte]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1
+ ; A2 B2 A1 B1 C1 D1 E1 F1
+ ; A3 B3 A2 B2 A1 B1 C1 D1
+ ; A4 B4 A3 B3 A2 B2 A1 B1
+ ; A5 B5 A4 B4 A3 B3 A2 B2
+ ; A6 B6 A5 B5 A4 B4 A3 B3
+ ; A7 B7 A6 B6 A5 B5 A4 B4
+ ; A8 B8 A7 B7 A6 B6 A5 B5
+ pavgb m6, m1, m2 ; 2-tap avg A8-A1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
+
+ punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
+ palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2 ; A-B2, A-B1, C-H1
+ movq [dstq+strideq ], m0
+ psrldq m0, 2 ; A-H1
+ movq [dstq ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
+ psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
+ movq [dstq+strideq*2], m6
+ psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
+ movq [dstq+strideq ], m6
+ psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
+ movq [dstq ], m6
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+ ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+ ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+ ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+ ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+ ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+ ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+ ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+ ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+ ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+ ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+ ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+ ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+ ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+ ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+ ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+ pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m6, 15
+ palignr m3, m0, m6, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)]
+ pavgb m5, m0 ; A1 - Ag
+
+ punpcklbw m0, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+
+ pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
+
+ pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ palignr m2, m1, m6, 14
+ mova [dstq ], m2
+ palignr m2, m1, m6, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m1, m6, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m1, m6, 6
+ mova [dstq ], m2
+ palignr m2, m1, m6, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 2
+ mova [dstq+strideq*2], m2
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+ mova [dstq+stride3q ], m6
+ lea dstq, [dstq+strideq*4]
+
+ palignr m2, m6, m4, 14
+ mova [dstq ], m2
+ palignr m2, m6, m4, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m6, m4, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m6, m4, 6
+ mova [dstq ], m2
+ palignr m2, m6, m4, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 2
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ movu m1, [aboveq+15]
+
+ pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
+
+ palignr m3, m1, m7, 1
+ palignr m5, m1, m7, 2
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
+
+ pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m7, 15
+ palignr m3, m0, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pavgb m5, m0 ; A1 - Ag
+ punpcklbw m6, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+ pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+
+ DEFINE_ARGS dst, stride, stride3, left, line
+ lea stride3q, [strideq*3]
+
+ palignr m5, m2, m1, 14
+ palignr m7, m1, m6, 14
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 12
+ palignr m7, m1, m6, 12
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 10
+ palignr m7, m1, m6, 10
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m2, m1, 8
+ palignr m7, m1, m6, 8
+ mova [dstq+stride3q ], m7
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m2, m1, 6
+ palignr m7, m1, m6, 6
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 4
+ palignr m7, m1, m6, 4
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 2
+ palignr m7, m1, m6, 2
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m6
+ mova [dstq+stride3q+16 ], m1
+ lea dstq, [dstq+strideq*4]
+
+ palignr m5, m1, m6, 14
+ palignr m3, m6, m4, 14
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 12
+ palignr m3, m6, m4, 12
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 10
+ palignr m3, m6, m4, 10
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m1, m6, 8
+ palignr m3, m6, m4, 8
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m1, m6, 6
+ palignr m3, m6, m4, 6
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 4
+ palignr m3, m6, m4, 4
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 2
+ palignr m3, m6, m4, 2
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m4
+ mova [dstq+stride3q+16 ], m6
+ lea dstq, [dstq+strideq*4]
+
+ mova m7, [leftq]
+ mova m3, [leftq+16]
+ palignr m5, m3, m7, 15
+ palignr m0, m3, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
+ pavgb m5, m3 ; Ah -
+ punpcklbw m3, m2, m5 ; A-B8 ... A-B1
+ punpckhbw m2, m5 ; A-B9 ... A-Bg
+ pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
+
+ palignr m7, m6, m4, 14
+ palignr m0, m4, m3, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 12
+ palignr m0, m4, m3, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 10
+ palignr m0, m4, m3, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m6, m4, 8
+ palignr m0, m4, m3, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m6, m4, 6
+ palignr m0, m4, m3, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 4
+ palignr m0, m4, m3, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 2
+ palignr m0, m4, m3, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m4
+ lea dstq, [dstq+strideq*4]
+
+ palignr m7, m4, m3, 14
+ palignr m0, m3, m2, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 12
+ palignr m0, m3, m2, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 10
+ palignr m0, m3, m2, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m4, m3, 8
+ palignr m0, m3, m2, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m4, m3, 6
+ palignr m0, m3, m2, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 4
+ palignr m0, m3, m2, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 2
+ palignr m0, m3, m2, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m3
+
+ RESTORE_GOT
+ RET
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
new file mode 100644
index 000000000..5795a1845
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
@@ -0,0 +1,3631 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
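+// Add four 16-bit residuals to four destination pixels: widen the pixel
+// bytes to 16 bits, add, saturate back to 8 bits and store the packed dword.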
+#define RECON_AND_STORE4X4(dest, in_x) \
+ { \
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+ }
+
+void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i cst = _mm_setr_epi16(
+ (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+ (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i input0, input1, input2, input3;
+
+ // Rows
+ input0 = load_input_data(input);
+ input2 = load_input_data(input + 8);
+
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_shufflelo_epi16(input0, 0xd8);
+ input0 = _mm_shufflehi_epi16(input0, 0xd8);
+ input2 = _mm_shufflelo_epi16(input2, 0xd8);
+ input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+ input1 = _mm_unpackhi_epi32(input0, input0);
+ input0 = _mm_unpacklo_epi32(input0, input0);
+ input3 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpacklo_epi32(input2, input2);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, input1);
+ input1 = _mm_packs_epi32(input2, input3);
+
+ // Transpose
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch columns 2 and 3; we then have:
+  // input2: column 1, column 0; input3: column 2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Columns
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_unpacklo_epi32(input2, input2);
+ input1 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpackhi_epi32(input3, input3);
+ input3 = _mm_unpacklo_epi32(input3, input3);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, input2);
+ input1 = _mm_packs_epi32(input1, input3);
+
+ // Transpose
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch columns 2 and 3; we then have:
+  // input2: column 1, column 0; input3: column 2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Final round and shift
+ input2 = _mm_add_epi16(input2, eight);
+ input3 = _mm_add_epi16(input3, eight);
+
+ input2 = _mm_srai_epi16(input2, 4);
+ input3 = _mm_srai_epi16(input3, 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d2 = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, input2);
+ d2 = _mm_add_epi16(d2, input3);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store input0
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store input2
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ // store input3
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ }
+}
+
+void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
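+  // DC-only path: input[0] is the only non-zero coefficient, so the 2-D
+  // inverse transform collapses to scaling it by cospi_16_64 once per pass
+  // before the final >>4 rounding, and the result is added to every pixel.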
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 4);
+
+ if (a == 0) return;
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
+}
+
+void aom_idct4_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+
+ array_transpose_4x4(in);
+ // stage 1
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[3], v[2]);
+
+ // stage 2
+ in[0] = _mm_add_epi16(u[0], u[1]);
+ in[1] = _mm_sub_epi16(u[0], u[1]);
+ in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+void aom_iadst4_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+ const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8], in7;
+
+ array_transpose_4x4(in);
+ in7 = _mm_srli_si128(in[1], 8);
+ in7 = _mm_add_epi16(in7, in[0]);
+ in7 = _mm_sub_epi16(in7, in[1]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpackhi_epi16(in[0], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(v[3], v[4]);
+ u[2] = v[2];
+ u[3] = _mm_add_epi32(u[0], u[1]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_add_epi32(u[3], v[5]);
+ u[6] = _mm_sub_epi32(u[5], u[4]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
+// Multiply interleaved 16-bit elements by pairs of constants and sum the
+// adjacent products (pmaddwd), then round, shift and pack the results back
+// to 16 bits: the shared butterfly step of the IDCTs below.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
+ res0, res1, res2, res3) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ tmp4 = _mm_madd_epi16(lo_1, cst2); \
+ tmp5 = _mm_madd_epi16(hi_1, cst2); \
+ tmp6 = _mm_madd_epi16(lo_1, cst3); \
+ tmp7 = _mm_madd_epi16(hi_1, cst3); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp4 = _mm_add_epi32(tmp4, rounding); \
+ tmp5 = _mm_add_epi32(tmp5, rounding); \
+ tmp6 = _mm_add_epi32(tmp6, rounding); \
+ tmp7 = _mm_add_epi32(tmp7, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res2 = _mm_packs_epi32(tmp4, tmp5); \
+ res3 = _mm_packs_epi32(tmp6, tmp7); \
+ }
+
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ }
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
+ out4, out5, out6, out7) \
+ { \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
+ stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+ const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
+ stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+ tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ } \
+ \
+ /* Stage4 */ \
+ out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+ out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+ out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+ out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+ out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+ out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+ out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+ out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+ }
+
+void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+ in4 = load_input_data(input + 8 * 4);
+ in5 = load_input_data(input + 8 * 5);
+ in6 = load_input_data(input + 8 * 6);
+ in7 = load_input_data(input + 8 * 7);
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ // 8x8 Transpose is copied from aom_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ // 4-stage 1D idct8x8
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
+ in6, in7);
+ }
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 5);
+
+ if (a == 0) return;
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
+}
+
+void aom_idct8_sse2(__m128i *in) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ // 8x8 Transpose is copied from aom_fdct8x8_sse2()
+ TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
+ in1, in2, in3, in4, in5, in6, in7);
+
+ // 4-stage 1D idct8x8
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
+ in[4], in[5], in[6], in[7]);
+}
+
+void aom_iadst8_sse2(__m128i *in) {
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // transpose
+ array_transpose_8x8(in, in);
+
+ // properly aligned for butterfly input
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
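+  // Sparse-block shortcut: only the top-left 4x4 group of coefficients is
+  // assumed non-zero, so just four input rows are loaded and the first pass
+  // runs a reduced transform before the full-width second pass.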
+ // Rows. Load 4-row input data.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+
+ // 8x4 Transpose
+ TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+ // Stage1
+ {
+ const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+ const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+ tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+ tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+ tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+ tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+ stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+ }
+
+ // Stage2
+ {
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+ const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+ tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+ tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+ tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+ tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+ stp2_4 = tmp0;
+ stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+ tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+ tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+ stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+ stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+ tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+ }
+
+ // Stage4
+ tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+ TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+ IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
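+// One 16-point 1-D IDCT pass (stages 2-6) over in[0..15]; the caller forms
+// the final stage-7 butterflies from the resulting stp1_*/stp2_* values.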
+#define IDCT16 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+ const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
+ const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
+ const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+ const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
+ stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
+ \
+ MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
+ stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+ const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
+ stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
+ \
+ stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ \
+ stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+ const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
+ stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ }
+
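+// Reduced IDCT16 variant for sparse blocks where only in[0..3] carry
+// non-zero coefficients; the symmetric terms collapse, so several stage
+// outputs are simple copies instead of full butterflies.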
+#define IDCT16_10 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
+ stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
+ stp1_12_0) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
+ \
+ stp1_9 = stp1_8_0; \
+ stp1_10 = stp1_11; \
+ \
+ stp1_13 = stp1_12_0; \
+ stp1_14 = stp1_15; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
+ stp2_5 = stp2_4; \
+ stp2_6 = stp2_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_2 = stp1_1; \
+ stp1_3 = stp1_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ }
+
+void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[16], l[16], r[16], *curr1;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ curr1 = l;
+ for (i = 0; i < 2; i++) {
+    // First pass: 1-D IDCT along the rows, 8 rows per iteration.
+
+ // Load input data.
+ in[0] = load_input_data(input);
+ in[8] = load_input_data(input + 8 * 1);
+ in[1] = load_input_data(input + 8 * 2);
+ in[9] = load_input_data(input + 8 * 3);
+ in[2] = load_input_data(input + 8 * 4);
+ in[10] = load_input_data(input + 8 * 5);
+ in[3] = load_input_data(input + 8 * 6);
+ in[11] = load_input_data(input + 8 * 7);
+ in[4] = load_input_data(input + 8 * 8);
+ in[12] = load_input_data(input + 8 * 9);
+ in[5] = load_input_data(input + 8 * 10);
+ in[13] = load_input_data(input + 8 * 11);
+ in[6] = load_input_data(input + 8 * 12);
+ in[14] = load_input_data(input + 8 * 13);
+ in[7] = load_input_data(input + 8 * 14);
+ in[15] = load_input_data(input + 8 * 15);
+
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+
+ IDCT16
+
+ // Stage7
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ curr1 = r;
+ input += 128;
+ }
+ for (i = 0; i < 2; i++) {
+ int j;
+    // Second pass: 1-D IDCT along the columns, 8 columns per iteration.
+ array_transpose_8x8(l + i * 8, in);
+ array_transpose_8x8(r + i * 8, in + 8);
+
+ IDCT16
+
+    // Stage7: combine into the final 2-D output.
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
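+// DC-only path: with all AC coefficients zero the 2-D output is a constant
+// offset. Scale the DC term by cospi_16_64 once per 1-D pass, round, shift
+// by 6 as in the full transform, and add the result to every destination
+// pixel.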
+void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ if (a == 0) return;
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 16; ++i) {
+ RECON_AND_STORE(dest + 0, dc_value);
+ RECON_AND_STORE(dest + 8, dc_value);
+ dest += stride;
+ }
+}
+
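+// Every rotation in the transforms below uses the same SSE2 pattern:
+//   u = _mm_unpacklo/hi_epi16(a, b);                 // interleave a and b
+//   v = _mm_madd_epi16(u, pair_set_epi16(c0, c1));   // a * c0 + b * c1
+//   v = (v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;  // fixed-point rounding
+//   out = _mm_packs_epi32(v_lo, v_hi);               // saturate to 16 bits
+// i.e. the vector form of dct_const_round_shift(a * c0 + b * c1).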
+void iadst16_8col(__m128i *in) {
+  // Perform the 16-point 1-D ADST on 8 columns at a time (one column per
+  // 16-bit lane).
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
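+  // stage 1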
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
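+// 16-point 1-D inverse DCT applied to 8 columns at a time (one column per
+// 16-bit lane); s[], t[], u[] and v[] hold the intermediate stage outputs.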
+void idct16_8col(__m128i *in) {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i v[16], u[16], s[16], t[16];
+
+ // stage 1
+ s[0] = in[0];
+ s[1] = in[8];
+ s[2] = in[4];
+ s[3] = in[12];
+ s[4] = in[2];
+ s[5] = in[10];
+ s[6] = in[6];
+ s[7] = in[14];
+ s[8] = in[1];
+ s[9] = in[9];
+ s[10] = in[5];
+ s[11] = in[13];
+ s[12] = in[3];
+ s[13] = in[11];
+ s[14] = in[7];
+ s[15] = in[15];
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+ u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+ u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+ u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[8] = _mm_packs_epi32(u[0], u[1]);
+ s[15] = _mm_packs_epi32(u[2], u[3]);
+ s[9] = _mm_packs_epi32(u[4], u[5]);
+ s[14] = _mm_packs_epi32(u[6], u[7]);
+ s[10] = _mm_packs_epi32(u[8], u[9]);
+ s[13] = _mm_packs_epi32(u[10], u[11]);
+ s[11] = _mm_packs_epi32(u[12], u[13]);
+ s[12] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ t[0] = s[0];
+ t[1] = s[1];
+ t[2] = s[2];
+ t[3] = s[3];
+ u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[4] = _mm_packs_epi32(u[0], u[1]);
+ t[7] = _mm_packs_epi32(u[2], u[3]);
+ t[5] = _mm_packs_epi32(u[4], u[5]);
+ t[6] = _mm_packs_epi32(u[6], u[7]);
+ t[8] = _mm_add_epi16(s[8], s[9]);
+ t[9] = _mm_sub_epi16(s[8], s[9]);
+ t[10] = _mm_sub_epi16(s[11], s[10]);
+ t[11] = _mm_add_epi16(s[10], s[11]);
+ t[12] = _mm_add_epi16(s[12], s[13]);
+ t[13] = _mm_sub_epi16(s[12], s[13]);
+ t[14] = _mm_sub_epi16(s[15], s[14]);
+ t[15] = _mm_add_epi16(s[14], s[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+ u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+ u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+ u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+ u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+ u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_add_epi16(t[4], t[5]);
+ s[5] = _mm_sub_epi16(t[4], t[5]);
+ s[6] = _mm_sub_epi16(t[7], t[6]);
+ s[7] = _mm_add_epi16(t[6], t[7]);
+ s[8] = t[8];
+ s[15] = t[15];
+ s[9] = _mm_packs_epi32(u[8], u[9]);
+ s[14] = _mm_packs_epi32(u[10], u[11]);
+ s[10] = _mm_packs_epi32(u[12], u[13]);
+ s[13] = _mm_packs_epi32(u[14], u[15]);
+ s[11] = t[11];
+ s[12] = t[12];
+
+ // stage 5
+ t[0] = _mm_add_epi16(s[0], s[3]);
+ t[1] = _mm_add_epi16(s[1], s[2]);
+ t[2] = _mm_sub_epi16(s[1], s[2]);
+ t[3] = _mm_sub_epi16(s[0], s[3]);
+ t[4] = s[4];
+ t[7] = s[7];
+
+ u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ t[5] = _mm_packs_epi32(u[0], u[1]);
+ t[6] = _mm_packs_epi32(u[2], u[3]);
+
+ t[8] = _mm_add_epi16(s[8], s[11]);
+ t[9] = _mm_add_epi16(s[9], s[10]);
+ t[10] = _mm_sub_epi16(s[9], s[10]);
+ t[11] = _mm_sub_epi16(s[8], s[11]);
+ t[12] = _mm_sub_epi16(s[15], s[12]);
+ t[13] = _mm_sub_epi16(s[14], s[13]);
+ t[14] = _mm_add_epi16(s[13], s[14]);
+ t[15] = _mm_add_epi16(s[12], s[15]);
+
+ // stage 6
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
+
+ u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+ u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+ u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ s[10] = _mm_packs_epi32(u[0], u[1]);
+ s[13] = _mm_packs_epi32(u[2], u[3]);
+ s[11] = _mm_packs_epi32(u[4], u[5]);
+ s[12] = _mm_packs_epi32(u[6], u[7]);
+ s[14] = t[14];
+ s[15] = t[15];
+
+ // stage 7
+ in[0] = _mm_add_epi16(s[0], s[15]);
+ in[1] = _mm_add_epi16(s[1], s[14]);
+ in[2] = _mm_add_epi16(s[2], s[13]);
+ in[3] = _mm_add_epi16(s[3], s[12]);
+ in[4] = _mm_add_epi16(s[4], s[11]);
+ in[5] = _mm_add_epi16(s[5], s[10]);
+ in[6] = _mm_add_epi16(s[6], s[9]);
+ in[7] = _mm_add_epi16(s[7], s[8]);
+ in[8] = _mm_sub_epi16(s[7], s[8]);
+ in[9] = _mm_sub_epi16(s[6], s[9]);
+ in[10] = _mm_sub_epi16(s[5], s[10]);
+ in[11] = _mm_sub_epi16(s[4], s[11]);
+ in[12] = _mm_sub_epi16(s[3], s[12]);
+ in[13] = _mm_sub_epi16(s[2], s[13]);
+ in[14] = _mm_sub_epi16(s[1], s[14]);
+ in[15] = _mm_sub_epi16(s[0], s[15]);
+}
+
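+// Full 16x16 1-D transforms: transpose the block (held as two groups of 16
+// registers) and run the 8-column kernel on each half.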
+void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ idct16_8col(in0);
+ idct16_8col(in1);
+}
+
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ iadst16_8col(in0);
+ iadst16_8col(in1);
+}
+
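+// Variant for sparse blocks: only the top-left coefficients (four rows of
+// eight) are loaded and everything else is treated as zero, so the first
+// pass uses shortened stages and produces only the left 8x16 half of the
+// intermediate result. The second pass then runs the reduced IDCT16_10
+// macro for each group of 8 output columns.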
+void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i in[16], l[16];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
+ stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
+ stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+ // First 1-D inverse DCT
+ // Load input data.
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8 * 2);
+ in[2] = load_input_data(input + 8 * 4);
+ in[3] = load_input_data(input + 8 * 6);
+
+ TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+ // Stage2
+ {
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+ tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+ tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+ tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp2_8 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+ tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+ stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+ }
+
+ // Stage4
+ {
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+ tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+ tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+ tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+ tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+ tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+ tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+ stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+ stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+ stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+ stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+ }
+
+ // Stage5 and Stage6
+ {
+ tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+ tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+ tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+ tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+ stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
+ stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+ stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
+ stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+ stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+ stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+ stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+ stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+ }
+
+ // Stage6
+ {
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+ tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+ tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+ tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+ tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
+ stp2_10 = _mm_packs_epi32(tmp0, zero);
+ stp2_13 = _mm_packs_epi32(tmp2, zero);
+ stp2_11 = _mm_packs_epi32(tmp4, zero);
+ stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+ tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+ tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+ tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+ tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+ stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+ stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+ stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+ stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+ stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+ stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+ stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+ }
+
+ // Stage7. Left 8x16 only.
+ l[0] = _mm_add_epi16(stp2_0, stp1_15);
+ l[1] = _mm_add_epi16(stp2_1, stp1_14);
+ l[2] = _mm_add_epi16(stp2_2, stp2_13);
+ l[3] = _mm_add_epi16(stp2_3, stp2_12);
+ l[4] = _mm_add_epi16(stp2_4, stp2_11);
+ l[5] = _mm_add_epi16(stp2_5, stp2_10);
+ l[6] = _mm_add_epi16(stp2_6, stp1_9);
+ l[7] = _mm_add_epi16(stp2_7, stp1_8);
+ l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ // Second 1-D inverse transform, performed per 8x16 block
+ for (i = 0; i < 2; i++) {
+ int j;
+ array_transpose_4X8(l + 8 * i, in);
+
+ IDCT16_10
+
+ // Stage7
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
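+// Load eight dequantized coefficients into |reg| and advance |input| past
+// them.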
+#define LOAD_DQCOEFF(reg, input) \
+ { \
+ reg = load_input_data(input); \
+ input += 8; \
+ }
+
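+// Reduced 32-point 1-D IDCT for sparse blocks: only in[0]..in[7] are
+// assumed non-zero, so the missing partner of each rotation is supplied by
+// unpacking against |zero| and several stage outputs are plain copies.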
+#define IDCT32_34 \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+ \
+ const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+ \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
+ stp1_31); \
+ MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
+ stp1_28); \
+ MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
+ stp1_27); \
+ MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
+ stp1_24); \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
+ stp2_15); \
+ MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
+ stp2_12); \
+ \
+ stp2_16 = stp1_16; \
+ stp2_19 = stp1_19; \
+ \
+ stp2_20 = stp1_20; \
+ stp2_23 = stp1_23; \
+ \
+ stp2_24 = stp1_24; \
+ stp2_27 = stp1_27; \
+ \
+ stp2_28 = stp1_28; \
+ stp2_31 = stp1_31; \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
+ stp1_7); \
+ \
+ stp1_8 = stp2_8; \
+ stp1_11 = stp2_11; \
+ stp1_12 = stp2_12; \
+ stp1_15 = stp2_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
+ stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
+ stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
+ stp2_1); \
+ \
+ stp2_4 = stp1_4; \
+ stp2_5 = stp1_4; \
+ stp2_6 = stp1_7; \
+ stp2_7 = stp1_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = stp2_0; \
+ stp1_1 = stp2_1; \
+ stp1_2 = stp2_1; \
+ stp1_3 = stp2_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
+ stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+ } \
+ \
+ /* Stage7 */ \
+ { \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
+ stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ }
+
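+// IDCT32: full 32-point inverse DCT butterfly (stages 1-7) over the 32 input
+// vectors split across in0[0..15] and in1[0..15]; the stage-7 results are
+// left in stp1_0..stp1_31.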
+#define IDCT32(in0, in1) \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
+ const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
+ stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
+ stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
+ stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
+ stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
+ stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
+ stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
+ stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
+ stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+ } \
+ \
+ /* Stage7 */ \
+ { \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
+ stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
+ stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+ }
+
+// Only the upper-left 8x8 block has non-zero coefficients.
+void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
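+  // 1 << 5 is half of the final >> 6 shift, so the output rounds to nearest.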
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[32], col[32];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 32);
+ in[2] = load_input_data(input + 64);
+ in[3] = load_input_data(input + 96);
+ in[4] = load_input_data(input + 128);
+ in[5] = load_input_data(input + 160);
+ in[6] = load_input_data(input + 192);
+ in[7] = load_input_data(input + 224);
+
+ for (i = 8; i < 32; ++i) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ array_transpose_8x8(in, in);
+  // TODO(hkuang): The following transposes are unnecessary, but removing them
+  // leads to a performance drop on some devices.
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32_34
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[31] = _mm_sub_epi16(stp1_0, stp1_31);
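+  // Second pass: run the IDCT along the other dimension and reconstruct the
+  // output eight columns at a time.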
+ for (i = 0; i < 4; i++) {
+ int j;
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + i * 8, in);
+ IDCT32_34
+
+ // 2_D: Calculate the results and store them to destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[32], col[128], zero_idx[16];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j, i32;
+
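+  // First pass: 1-D IDCT over each group of 8 input rows; col[128] collects
+  // the four 8x32 intermediate blocks for the second pass below.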
+ for (i = 0; i < 4; i++) {
+ i32 = (i << 5);
+ // First 1-D idct
+ // Load input data.
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
+
+    // Check whether all coefficients in this group of rows are zero.
+ zero_idx[0] = _mm_or_si128(in[0], in[1]);
+ zero_idx[1] = _mm_or_si128(in[2], in[3]);
+ zero_idx[2] = _mm_or_si128(in[4], in[5]);
+ zero_idx[3] = _mm_or_si128(in[6], in[7]);
+ zero_idx[4] = _mm_or_si128(in[8], in[9]);
+ zero_idx[5] = _mm_or_si128(in[10], in[11]);
+ zero_idx[6] = _mm_or_si128(in[12], in[13]);
+ zero_idx[7] = _mm_or_si128(in[14], in[15]);
+ zero_idx[8] = _mm_or_si128(in[16], in[17]);
+ zero_idx[9] = _mm_or_si128(in[18], in[19]);
+ zero_idx[10] = _mm_or_si128(in[20], in[21]);
+ zero_idx[11] = _mm_or_si128(in[22], in[23]);
+ zero_idx[12] = _mm_or_si128(in[24], in[25]);
+ zero_idx[13] = _mm_or_si128(in[26], in[27]);
+ zero_idx[14] = _mm_or_si128(in[28], in[29]);
+ zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+ zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+ zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+ zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
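+    // If every coefficient in this group of rows is zero, skip the IDCT and
+    // store zeros for its intermediate results.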
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ }
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32(in, in + 16)
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ }
+ for (i = 0; i < 4; i++) {
+ // Second 1-D idct
+ j = i << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + j, in);
+ array_transpose_8x8(col + j + 32, in + 8);
+ array_transpose_8x8(col + j + 64, in + 16);
+ array_transpose_8x8(col + j + 96, in + 24);
+
+ IDCT32(in, in + 16)
+
+ // 2_D: Calculate the results and store them to destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, j;
+
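+  // DC-only case: apply cospi_16_64 with rounding for each 1-D pass, then the
+  // final >> 6, and add the resulting constant to every pixel.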
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ if (a == 0) return;
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (j = 0; j < 32; ++j) {
+ RECON_AND_STORE(dest + 0 + j * stride, dc_value);
+ RECON_AND_STORE(dest + 8 + j * stride, dc_value);
+ RECON_AND_STORE(dest + 16 + j * stride, dc_value);
+ RECON_AND_STORE(dest + 24 + j * stride, dc_value);
+ }
+}
+
+// Apply a 32-element IDCT to 8 columns. This does not do any transposition
+// of its input - the caller is expected to have done that.
+// The input buffers are the top and bottom halves of an 8x32 block.
+void idct32_8col(__m128i *in0, __m128i *in1) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ IDCT32(in0, in1)
+
+  // Combine the final stage outputs and write the results back into in0/in1.
+ in0[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in0[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in0[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in0[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in0[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in0[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in0[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in0[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in0[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in0[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in0[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in0[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in0[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in0[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in0[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in0[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
+ in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
+ in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
+ in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
+ in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
+ in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
+ in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
+ in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
+ in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
+ in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
+ in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
+ in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
+ in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
+ in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
+ in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
+ in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
+}
+
+#if CONFIG_HIGHBITDEPTH
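+// Clamp reconstructed pixel values to [0, (1 << bd) - 1].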
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+ __m128i ubounded, retval;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ ubounded = _mm_cmpgt_epi16(value, max);
+ retval = _mm_andnot_si128(ubounded, value);
+ ubounded = _mm_and_si128(ubounded, max);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+ return retval;
+}
+
+void aom_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ __m128i inptr[4];
+ __m128i sign_bits[2];
+ __m128i temp_mm, min_input, max_input;
+ int test;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int optimised_cols = 0;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(12043);
+ const __m128i min = _mm_set1_epi16(-12043);
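+  // Coefficients outside [-12043, 12043] take the high-precision C fallback
+  // below instead of the 16-bit SSE2 transform.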
+ // Load input into __m128i
+ inptr[0] = _mm_loadu_si128((const __m128i *)input);
+ inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+ inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+ inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+ // Pack to 16 bits
+ inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+ inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (!test) {
+ // Do the row transform
+ aom_idct4_sse2(inptr);
+
+ // Check the min & max values
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (test) {
+ array_transpose_4x4(inptr);
+ sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+ sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+ inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+ inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+ inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+ inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+ _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+ _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+ _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+ _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ aom_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+ }
+
+ if (optimised_cols) {
+ aom_idct4_sse2(inptr);
+
+ // Final round and shift
+ inptr[0] = _mm_add_epi16(inptr[0], eight);
+ inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+ inptr[0] = _mm_srai_epi16(inptr[0], 4);
+ inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi64(
+ d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+      // Store row 0
+      _mm_storel_epi64((__m128i *)dest, d0);
+      // Store row 1
+      d0 = _mm_srli_si128(d0, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride), d0);
+      // Store row 2
+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+      // Store row 3
+      d2 = _mm_srli_si128(d2, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[4], temp_out[4];
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+ aom_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+ }
+}
+
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
new file mode 100644
index 000000000..95d246c3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
+#define AOM_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/inv_txfm.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+// Perform a 4x4 transpose: res[0] holds rows 0-1, res[1] holds rows 2-3.
+static INLINE void array_transpose_4x4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
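+// Transpose an 8x8 block of 16-bit values held in eight __m128i registers.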
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+ }
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ \
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
+ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
+ }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
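+// Transpose a 16x16 block stored as two halves (res0 = left 8 columns,
+// res1 = right 8 columns, 16 rows each); the off-diagonal 8x8 sub-blocks are
+// swapped via tbuf.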
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
+// Load 8 coefficients as 16-bit values so the 8-bit optimised paths can be
+// used for profile 0 even when high bit depth is enabled.
+static INLINE __m128i load_input_data(const tran_low_t *data) {
+#if CONFIG_HIGHBITDEPTH
+ return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
+ data[6], data[7]);
+#else
+ return _mm_load_si128((const __m128i *)data);
+#endif
+}
+
+static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
+ in[0] = load_input_data(input + 0 * 16);
+ in[1] = load_input_data(input + 1 * 16);
+ in[2] = load_input_data(input + 2 * 16);
+ in[3] = load_input_data(input + 3 * 16);
+ in[4] = load_input_data(input + 4 * 16);
+ in[5] = load_input_data(input + 5 * 16);
+ in[6] = load_input_data(input + 6 * 16);
+ in[7] = load_input_data(input + 7 * 16);
+
+ in[8] = load_input_data(input + 8 * 16);
+ in[9] = load_input_data(input + 9 * 16);
+ in[10] = load_input_data(input + 10 * 16);
+ in[11] = load_input_data(input + 11 * 16);
+ in[12] = load_input_data(input + 12 * 16);
+ in[13] = load_input_data(input + 13 * 16);
+ in[14] = load_input_data(input + 14 * 16);
+ in[15] = load_input_data(input + 15 * 16);
+}
+
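+// Reconstruct 8 pixels: load 8 bytes from dest, widen to 16 bits, add the
+// residual in_x, saturate back to 8 bits and store. A 'zero' register must be
+// in scope.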
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ _mm_storel_epi64((__m128i *)(dest), d0); \
+ }
+
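+// Apply the final rounding and >> 6 shift, then reconstruct 16 rows of 8
+// pixels into dest.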
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+ RECON_AND_STORE(dest + 8 * stride, in[8]);
+ RECON_AND_STORE(dest + 9 * stride, in[9]);
+ RECON_AND_STORE(dest + 10 * stride, in[10]);
+ RECON_AND_STORE(dest + 11 * stride, in[11]);
+ RECON_AND_STORE(dest + 12 * stride, in[12]);
+ RECON_AND_STORE(dest + 13 * stride, in[13]);
+ RECON_AND_STORE(dest + 14 * stride, in[14]);
+ RECON_AND_STORE(dest + 15 * stride, in[15]);
+}
+
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
+ { \
+ const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+ const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ }
+
+void iadst16_8col(__m128i *in);
+void idct16_8col(__m128i *in);
+void aom_idct4_sse2(__m128i *in);
+void aom_idct8_sse2(__m128i *in);
+void aom_idct16_sse2(__m128i *in0, __m128i *in1);
+void aom_iadst4_sse2(__m128i *in);
+void aom_iadst8_sse2(__m128i *in);
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
+void idct32_8col(__m128i *in0, __m128i *in1);
+
+#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
new file mode 100644
index 000000000..9d006797b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
@@ -0,0 +1,1333 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
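+  // 1 << 4 is half of the final >> 5 shift, so the 8x8 output rounds to
+  // nearest.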
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+ in4 = load_input_data(input + 8 * 4);
+ in5 = load_input_data(input + 8 * 5);
+ in6 = load_input_data(input + 8 * 6);
+ in7 = load_input_data(input + 8 * 7);
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ // 4-stage 1D idct8x8
+ {
+ /* Stage1 */
+ {
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
+
+ {
+ tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+ tmp1 = _mm_madd_epi16(hi_17, stg1_0);
+ tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+ tmp3 = _mm_madd_epi16(hi_17, stg1_1);
+ tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+ tmp5 = _mm_madd_epi16(hi_35, stg1_2);
+ tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+ tmp7 = _mm_madd_epi16(hi_35, stg1_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, 14);
+ tmp1 = _mm_srai_epi32(tmp1, 14);
+ tmp2 = _mm_srai_epi32(tmp2, 14);
+ tmp3 = _mm_srai_epi32(tmp3, 14);
+ tmp4 = _mm_srai_epi32(tmp4, 14);
+ tmp5 = _mm_srai_epi32(tmp5, 14);
+ tmp6 = _mm_srai_epi32(tmp6, 14);
+ tmp7 = _mm_srai_epi32(tmp7, 14);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp1);
+ stp1_7 = _mm_packs_epi32(tmp2, tmp3);
+ stp1_5 = _mm_packs_epi32(tmp4, tmp5);
+ stp1_6 = _mm_packs_epi32(tmp6, tmp7);
+ }
+ }
+
+ /* Stage2 */
+ {
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
+
+ {
+ tmp0 = _mm_unpacklo_epi16(in0, in4);
+ tmp1 = _mm_unpackhi_epi16(in0, in4);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_0);
+ tmp4 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp5 = _mm_madd_epi16(tmp1, stk2_1);
+
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp2, tmp3);
+ stp2_1 = _mm_packs_epi32(tmp4, tmp5);
+
+ tmp0 = _mm_madd_epi16(lo_26, stg2_2);
+ tmp1 = _mm_madd_epi16(hi_26, stg2_2);
+ tmp2 = _mm_madd_epi16(lo_26, stg2_3);
+ tmp3 = _mm_madd_epi16(hi_26, stg2_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+
+          tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+          tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+          tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+          tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+
+ stp2_2 = _mm_packs_epi32(tmp0, tmp1);
+ stp2_3 = _mm_packs_epi32(tmp2, tmp3);
+ }
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+ }
+
+ /* Stage3 */
+ {
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_1);
+ tmp4 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp5 = _mm_madd_epi16(tmp1, stk2_0);
+
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp2, tmp3);
+ stp1_6 = _mm_packs_epi32(tmp4, tmp5);
+ }
+
+ /* Stage4 */
+ in0 = _mm_add_epi16(stp1_0, stp2_7);
+ in1 = _mm_add_epi16(stp1_1, stp1_6);
+ in2 = _mm_add_epi16(stp1_2, stp1_5);
+ in3 = _mm_add_epi16(stp1_3, stp2_4);
+ in4 = _mm_sub_epi16(stp1_3, stp2_4);
+ in5 = _mm_sub_epi16(stp1_2, stp1_5);
+ in6 = _mm_sub_epi16(stp1_1, stp1_6);
+ in7 = _mm_sub_epi16(stp1_0, stp2_7);
+ }
+ }
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
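+// Only the top-left 4x4 block of coefficients is non-zero here, so every
+// butterfly has one zero input and each output reduces to a single product
+// x * cospi. The code exploits the SSSE3 pmulhrsw instruction for this:
+// _mm_mulhrs_epi16(x, 2 * cospi) returns, in effect,
+//   (x * 2 * cospi + (1 << 14)) >> 15
+// which equals the usual (x * cospi + DCT_CONST_ROUNDING) >> DCT_CONST_BITS,
+// saving the unpack/madd/round/shift/pack sequence.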
+void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+ const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
+ const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
+ const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
+ const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
+ const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+
+ // Rows. Load 4-row input data.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+
+ // 8x4 Transpose
+ TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+
+ // Stage1
+ tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
+ tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
+ tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
+ tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
+
+ stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
+ stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
+
+ // Stage2
+ tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
+ stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
+
+ tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
+ tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
+ stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
+
+ tmp0 = _mm_add_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
+
+ stp2_4 = tmp0;
+ stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+ tmp1 = _mm_madd_epi16(tmp0, stg3_0);
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp1, tmp2);
+
+ // Stage3
+ tmp2 = _mm_add_epi16(stp2_0, stp2_2);
+ tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
+
+ stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
+ stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
+
+ // Stage4
+ tmp0 = _mm_add_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_add_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
+
+ TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+ /* Stage1 */
+ stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
+ stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
+ stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
+ stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
+
+ /* Stage2 */
+ stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
+ stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
+
+ stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
+ stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+ /* Stage3 */
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_0);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_1);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ stp1_5 = _mm_packs_epi32(tmp2, tmp3);
+
+ /* Stage4 */
+ in0 = _mm_add_epi16(stp1_0, stp2_7);
+ in1 = _mm_add_epi16(stp1_1, stp1_6);
+ in2 = _mm_add_epi16(stp1_2, stp1_5);
+ in3 = _mm_add_epi16(stp1_3, stp2_4);
+ in4 = _mm_sub_epi16(stp1_3, stp2_4);
+ in5 = _mm_sub_epi16(stp1_2, stp1_5);
+ in6 = _mm_sub_epi16(stp1_1, stp1_6);
+ in7 = _mm_sub_epi16(stp1_0, stp2_7);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+// Addition and subtraction butterfly only; used with size = 16 or 32.
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+#define BUTTERFLY_PAIR(x0, x1, co0, co1) \
+ do { \
+ tmp0 = _mm_madd_epi16(x0, co0); \
+ tmp1 = _mm_madd_epi16(x1, co0); \
+ tmp2 = _mm_madd_epi16(x0, co1); \
+ tmp3 = _mm_madd_epi16(x1, co1); \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ } while (0)
+
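+// butterfly() applies a 2x2 rotation to each lane pair: with
+// c0 = pair_set_epi16(a, b) and c1 = pair_set_epi16(d, e),
+//   y0[i] = (x0[i] * a + x1[i] * b + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
+//   y1[i] = (x0[i] * d + x1[i] * e + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
+// (saturated back to 16 bits). butterfly_self() does the same but overwrites
+// its inputs.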
+static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
+ const __m128i *c0, const __m128i *c1, __m128i *y0,
+ __m128i *y1) {
+ __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ u0 = _mm_unpacklo_epi16(*x0, *x1);
+ u1 = _mm_unpackhi_epi16(*x0, *x1);
+ BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+ *y0 = _mm_packs_epi32(tmp0, tmp1);
+ *y1 = _mm_packs_epi32(tmp2, tmp3);
+}
+
+static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
+ const __m128i *c1) {
+ __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ u0 = _mm_unpacklo_epi16(*x0, *x1);
+ u1 = _mm_unpackhi_epi16(*x0, *x1);
+ BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+ *x0 = _mm_packs_epi32(tmp0, tmp1);
+ *x1 = _mm_packs_epi32(tmp2, tmp3);
+}
+
+static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
+ const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
+ const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
+ const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
+ const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
+
+ const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i x0, x1, x4, x5, x6, x7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+
+ // phase 1
+
+ // 0, 15
+ u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15
+ u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12
+ v15 = _mm_add_epi16(u2, u3);
+ // in[0], in[4]
+ x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0]
+ x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7]
+ v0 = _mm_add_epi16(x0, x7); // stp2_0
+ stp1[0] = _mm_add_epi16(v0, v15);
+ stp1[15] = _mm_sub_epi16(v0, v15);
+
+ // in[2], in[6]
+ u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8
+ u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11
+ butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14
+ butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13
+
+ v8 = _mm_add_epi16(u0, u1);
+ v9 = _mm_add_epi16(u4, u6);
+ v10 = _mm_sub_epi16(u4, u6);
+ v11 = _mm_sub_epi16(u0, u1);
+ v12 = _mm_sub_epi16(u2, u3);
+ v13 = _mm_sub_epi16(u5, u7);
+ v14 = _mm_add_epi16(u5, u7);
+
+ butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
+ butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
+
+ // 1, 14
+ x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0
+ // stp1[2] = stp1[0], stp1[3] = stp1[1]
+ x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4]
+ butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
+ v1 = _mm_add_epi16(x1, x6); // stp2_1
+ v2 = _mm_add_epi16(x0, x5); // stp2_2
+ stp1[1] = _mm_add_epi16(v1, v14);
+ stp1[14] = _mm_sub_epi16(v1, v14);
+
+ stp1[2] = _mm_add_epi16(v2, v13);
+ stp1[13] = _mm_sub_epi16(v2, v13);
+
+ v3 = _mm_add_epi16(x1, x4); // stp2_3
+ v4 = _mm_sub_epi16(x1, x4); // stp2_4
+
+ v5 = _mm_sub_epi16(x0, x5); // stp2_5
+
+ v6 = _mm_sub_epi16(x1, x6); // stp2_6
+ v7 = _mm_sub_epi16(x0, x7); // stp2_7
+ stp1[3] = _mm_add_epi16(v3, v12);
+ stp1[12] = _mm_sub_epi16(v3, v12);
+
+ stp1[6] = _mm_add_epi16(v6, v9);
+ stp1[9] = _mm_sub_epi16(v6, v9);
+
+ stp1[7] = _mm_add_epi16(v7, v8);
+ stp1[8] = _mm_sub_epi16(v7, v8);
+
+ stp1[4] = _mm_add_epi16(v4, v11);
+ stp1[11] = _mm_sub_epi16(v4, v11);
+
+ stp1[5] = _mm_add_epi16(v5, v10);
+ stp1[10] = _mm_sub_epi16(v5, v10);
+}
+
+static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
+ const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
+ const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
+ const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
+ const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
+ const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
+ const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
+ const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
+ const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+ __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+ __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+ __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+ v16 = _mm_mulhrs_epi16(in[1], stk1_0);
+ v31 = _mm_mulhrs_epi16(in[1], stk1_1);
+
+ v19 = _mm_mulhrs_epi16(in[7], stk1_6);
+ v28 = _mm_mulhrs_epi16(in[7], stk1_7);
+
+ v20 = _mm_mulhrs_epi16(in[5], stk1_8);
+ v27 = _mm_mulhrs_epi16(in[5], stk1_9);
+
+ v23 = _mm_mulhrs_epi16(in[3], stk1_14);
+ v24 = _mm_mulhrs_epi16(in[3], stk1_15);
+
+ butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
+ butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
+ butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
+ butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
+
+ u16 = _mm_add_epi16(v16, v19);
+ u17 = _mm_add_epi16(v17, v18);
+ u18 = _mm_sub_epi16(v17, v18);
+ u19 = _mm_sub_epi16(v16, v19);
+ u20 = _mm_sub_epi16(v23, v20);
+ u21 = _mm_sub_epi16(v22, v21);
+ u22 = _mm_add_epi16(v22, v21);
+ u23 = _mm_add_epi16(v23, v20);
+ u24 = _mm_add_epi16(v24, v27);
+ u27 = _mm_sub_epi16(v24, v27);
+ u25 = _mm_add_epi16(v25, v26);
+ u26 = _mm_sub_epi16(v25, v26);
+ u28 = _mm_sub_epi16(v31, v28);
+ u31 = _mm_add_epi16(v28, v31);
+ u29 = _mm_sub_epi16(v30, v29);
+ u30 = _mm_add_epi16(v29, v30);
+
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+
+ stp1[16] = _mm_add_epi16(u16, u23);
+ stp1[23] = _mm_sub_epi16(u16, u23);
+
+ stp1[17] = _mm_add_epi16(u17, u22);
+ stp1[22] = _mm_sub_epi16(u17, u22);
+
+ stp1[18] = _mm_add_epi16(u18, u21);
+ stp1[21] = _mm_sub_epi16(u18, u21);
+
+ stp1[19] = _mm_add_epi16(u19, u20);
+ stp1[20] = _mm_sub_epi16(u19, u20);
+
+ stp1[24] = _mm_sub_epi16(u31, u24);
+ stp1[31] = _mm_add_epi16(u24, u31);
+
+ stp1[25] = _mm_sub_epi16(u30, u25);
+ stp1[30] = _mm_add_epi16(u25, u30);
+
+ stp1[26] = _mm_sub_epi16(u29, u26);
+ stp1[29] = _mm_add_epi16(u26, u29);
+
+ stp1[27] = _mm_sub_epi16(u28, u27);
+ stp1[28] = _mm_add_epi16(u27, u28);
+
+ butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
+}
+
+// Only the upper-left 8x8 block has non-zero coefficients.
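+// The 2-D transform runs in two passes: the eight non-zero input rows are
+// transformed once into col[32] (a single 8x32 strip), then each of the four
+// 8-column groups is transformed again, rounded by (1 << 5), shifted right
+// by 6 and added to dest.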
+void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i in[32], col[32];
+ __m128i stp1[32];
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 32);
+ in[2] = load_input_data(input + 64);
+ in[3] = load_input_data(input + 96);
+ in[4] = load_input_data(input + 128);
+ in[5] = load_input_data(input + 160);
+ in[6] = load_input_data(input + 192);
+ in[7] = load_input_data(input + 224);
+
+ array_transpose_8x8(in, in);
+ idct32_34_first_half(in, stp1);
+ idct32_34_second_half(in, stp1);
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ add_sub_butterfly(stp1, col, 32);
+ for (i = 0; i < 4; i++) {
+ int j;
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + i * 8, in);
+ idct32_34_first_half(in, stp1);
+ idct32_34_second_half(in, stp1);
+
+ // 2_D: Calculate the results and store them to destination.
+ add_sub_butterfly(stp1, in, 32);
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+// in0[16] represents the left 8x16 block
+// in1[16] represents the right 8x16 block
+static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
+ __m128i *in1) {
+ int i;
+ for (i = 0; i < 16; i++) {
+ in0[i] = load_input_data(input);
+ in1[i] = load_input_data(input + 8);
+ input += 32;
+ }
+}
+
+static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
+ __m128i *out1) {
+ array_transpose_8x8(in0, out0);
+ array_transpose_8x8(&in0[8], out1);
+ array_transpose_8x8(in1, &out0[8]);
+ array_transpose_8x8(&in1[8], &out1[8]);
+}
+
+// Group the coefficient calculations into smaller functions
+// to limit register spilling to the stack:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[8]*/) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ {
+ const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
+ const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
+ u0 = _mm_mulhrs_epi16(in[0], stk4_0);
+ u2 = _mm_mulhrs_epi16(in[8], stk4_2);
+ u3 = _mm_mulhrs_epi16(in[8], stk4_3);
+ u1 = u0;
+ }
+
+ v0 = _mm_add_epi16(u0, u3);
+ v1 = _mm_add_epi16(u1, u2);
+ v2 = _mm_sub_epi16(u1, u2);
+ v3 = _mm_sub_epi16(u0, u3);
+
+ {
+ const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+ const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
+ const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
+ u4 = _mm_mulhrs_epi16(in[4], stk3_0);
+ u7 = _mm_mulhrs_epi16(in[4], stk3_1);
+ u5 = _mm_mulhrs_epi16(in[12], stk3_2);
+ u6 = _mm_mulhrs_epi16(in[12], stk3_3);
+ }
+
+ v4 = _mm_add_epi16(u4, u5);
+ v5 = _mm_sub_epi16(u4, u5);
+ v6 = _mm_sub_epi16(u7, u6);
+ v7 = _mm_add_epi16(u7, u6);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
+ }
+
+ out[0] = _mm_add_epi16(v0, v7);
+ out[1] = _mm_add_epi16(v1, v6);
+ out[2] = _mm_add_epi16(v2, v5);
+ out[3] = _mm_add_epi16(v3, v4);
+ out[4] = _mm_sub_epi16(v3, v4);
+ out[5] = _mm_sub_epi16(v2, v5);
+ out[6] = _mm_sub_epi16(v1, v6);
+ out[7] = _mm_sub_epi16(v0, v7);
+}
+
+static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[8]*/) {
+ __m128i u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v8, v9, v10, v11, v12, v13, v14, v15;
+
+ {
+ const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
+ const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
+ const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
+ const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
+ const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
+ const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
+ const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
+ const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
+ u8 = _mm_mulhrs_epi16(in[2], stk2_0);
+ u15 = _mm_mulhrs_epi16(in[2], stk2_1);
+ u9 = _mm_mulhrs_epi16(in[14], stk2_2);
+ u14 = _mm_mulhrs_epi16(in[14], stk2_3);
+ u10 = _mm_mulhrs_epi16(in[10], stk2_4);
+ u13 = _mm_mulhrs_epi16(in[10], stk2_5);
+ u11 = _mm_mulhrs_epi16(in[6], stk2_6);
+ u12 = _mm_mulhrs_epi16(in[6], stk2_7);
+ }
+
+ v8 = _mm_add_epi16(u8, u9);
+ v9 = _mm_sub_epi16(u8, u9);
+ v10 = _mm_sub_epi16(u11, u10);
+ v11 = _mm_add_epi16(u11, u10);
+ v12 = _mm_add_epi16(u12, u13);
+ v13 = _mm_sub_epi16(u12, u13);
+ v14 = _mm_sub_epi16(u15, u14);
+ v15 = _mm_add_epi16(u15, u14);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+ butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(v8, v11);
+ out[1] = _mm_add_epi16(v9, v10);
+ out[2] = _mm_sub_epi16(v9, v10);
+ out[3] = _mm_sub_epi16(v8, v11);
+ out[4] = _mm_sub_epi16(v15, v12);
+ out[5] = _mm_sub_epi16(v14, v13);
+ out[6] = _mm_add_epi16(v14, v13);
+ out[7] = _mm_add_epi16(v15, v12);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+ butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
+ }
+}
+
+// For an 8x32 block, the 8 even-indexed entries of in[16] produce the
+// first 16 outputs, written to out[32].
+static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_8x32_135_quarter_1(in, temp);
+ idct32_8x32_135_quarter_2(in, &temp[8]);
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For an 8x32 block, the 8 odd-indexed entries of in[16] produce the
+// second 16 outputs, written to out[32].
+static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+ __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+ __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+ __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+ {
+ const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
+ const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
+ const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
+ const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
+
+ const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
+ const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
+ const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
+ const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
+ const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
+ const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
+ const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
+ const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
+
+ const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
+ const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
+ const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
+ const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
+ u16 = _mm_mulhrs_epi16(in[1], stk1_0);
+ u31 = _mm_mulhrs_epi16(in[1], stk1_1);
+ u17 = _mm_mulhrs_epi16(in[15], stk1_2);
+ u30 = _mm_mulhrs_epi16(in[15], stk1_3);
+
+ u18 = _mm_mulhrs_epi16(in[9], stk1_4);
+ u29 = _mm_mulhrs_epi16(in[9], stk1_5);
+ u19 = _mm_mulhrs_epi16(in[7], stk1_6);
+ u28 = _mm_mulhrs_epi16(in[7], stk1_7);
+
+ u20 = _mm_mulhrs_epi16(in[5], stk1_8);
+ u27 = _mm_mulhrs_epi16(in[5], stk1_9);
+ u21 = _mm_mulhrs_epi16(in[11], stk1_10);
+ u26 = _mm_mulhrs_epi16(in[11], stk1_11);
+
+ u22 = _mm_mulhrs_epi16(in[13], stk1_12);
+ u25 = _mm_mulhrs_epi16(in[13], stk1_13);
+ u23 = _mm_mulhrs_epi16(in[3], stk1_14);
+ u24 = _mm_mulhrs_epi16(in[3], stk1_15);
+ }
+
+ v16 = _mm_add_epi16(u16, u17);
+ v17 = _mm_sub_epi16(u16, u17);
+ v18 = _mm_sub_epi16(u19, u18);
+ v19 = _mm_add_epi16(u19, u18);
+
+ v20 = _mm_add_epi16(u20, u21);
+ v21 = _mm_sub_epi16(u20, u21);
+ v22 = _mm_sub_epi16(u23, u22);
+ v23 = _mm_add_epi16(u23, u22);
+
+ v24 = _mm_add_epi16(u24, u25);
+ v25 = _mm_sub_epi16(u24, u25);
+ v26 = _mm_sub_epi16(u27, u26);
+ v27 = _mm_add_epi16(u27, u26);
+
+ v28 = _mm_add_epi16(u28, u29);
+ v29 = _mm_sub_epi16(u28, u29);
+ v30 = _mm_sub_epi16(u31, u30);
+ v31 = _mm_add_epi16(u31, u30);
+
+ {
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+ butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+ butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+ butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
+ }
+
+ u16 = _mm_add_epi16(v16, v19);
+ u17 = _mm_add_epi16(v17, v18);
+ u18 = _mm_sub_epi16(v17, v18);
+ u19 = _mm_sub_epi16(v16, v19);
+ u20 = _mm_sub_epi16(v23, v20);
+ u21 = _mm_sub_epi16(v22, v21);
+ u22 = _mm_add_epi16(v22, v21);
+ u23 = _mm_add_epi16(v23, v20);
+
+ u24 = _mm_add_epi16(v24, v27);
+ u25 = _mm_add_epi16(v25, v26);
+ u26 = _mm_sub_epi16(v25, v26);
+ u27 = _mm_sub_epi16(v24, v27);
+ u28 = _mm_sub_epi16(v31, v28);
+ u29 = _mm_sub_epi16(v30, v29);
+ u30 = _mm_add_epi16(v29, v30);
+ u31 = _mm_add_epi16(v28, v31);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(u16, u23);
+ out[1] = _mm_add_epi16(u17, u22);
+ out[2] = _mm_add_epi16(u18, u21);
+ out[3] = _mm_add_epi16(u19, u20);
+ v20 = _mm_sub_epi16(u19, u20);
+ v21 = _mm_sub_epi16(u18, u21);
+ v22 = _mm_sub_epi16(u17, u22);
+ v23 = _mm_sub_epi16(u16, u23);
+
+ v24 = _mm_sub_epi16(u31, u24);
+ v25 = _mm_sub_epi16(u30, u25);
+ v26 = _mm_sub_epi16(u29, u26);
+ v27 = _mm_sub_epi16(u28, u27);
+ out[12] = _mm_add_epi16(u27, u28);
+ out[13] = _mm_add_epi16(u26, u29);
+ out[14] = _mm_add_epi16(u25, u30);
+ out[15] = _mm_add_epi16(u24, u31);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
+ butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
+ butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
+ butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
+ }
+}
+
+// Reads the 8x16 non-zero block held in in[0..15] and writes all 32 outputs
+// of the 8x32 block back to in[] in place.
+static void idct32_8x32_135(__m128i *in /*in[32]*/) {
+ __m128i out[32];
+ idct32_8x32_quarter_1_2(in, out);
+ idct32_8x32_quarter_3_4(in, &out[16]);
+ add_sub_butterfly(out, in, 32);
+}
+
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm_srai_epi16(in[j], 6);
+ in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+ RECON_AND_STORE(dst, in[j]);
+ dst += stride;
+ RECON_AND_STORE(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
+ int stride) {
+ store_buffer_8x32(in0, dest, stride);
+ store_buffer_8x32(in1, dest + 8, stride);
+}
+
+static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
+ idct32_8x32_135(col0);
+ idct32_8x32_135(col1);
+}
+
+typedef enum { left_16, right_16 } ColsIndicator;
+
+static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
+ ColsIndicator cols) {
+ switch (cols) {
+ case left_16: {
+ int i;
+ array_transpose_16x16(in0, in1);
+ for (i = 0; i < 16; ++i) {
+ store[i] = in0[16 + i];
+ store[16 + i] = in1[16 + i];
+ }
+ break;
+ }
+ case right_16: {
+ array_transpose_16x16_2(store, &store[16], in0, in1);
+ break;
+ }
+ default: { assert(0); }
+ }
+}
+
+// Only the upper-left 16x16 block has non-zero coefficients.
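+// Two passes: one row pass over the non-zero 16x16 block, then two column
+// passes (columns 0-15 and 16-31). transpose_and_copy_16x16() stashes the
+// row-pass output needed for the right half in temp[] while the left half is
+// transformed and stored.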
+void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ // Each array represents an 8x32 block
+ __m128i col0[32], col1[32];
+ // This array represents a 16x16 block
+ __m128i temp[32];
+
+ // Load input data. Only need to load the top left 16x16 block.
+ load_buffer_16x16(input, col0, col1);
+
+ // columns
+ array_transpose_16x16(col0, col1);
+ idct32_135(col0, col1);
+
+ // rows
+ transpose_and_copy_16x16(col0, col1, temp, left_16);
+ idct32_135(col0, col1);
+ recon_and_store(col0, col1, dest, stride);
+
+ transpose_and_copy_16x16(col0, col1, temp, right_16);
+ idct32_135(col0, col1);
+ recon_and_store(col0, col1, dest + 16, stride);
+}
+
+// For each 8x32 block __m128i in[32], the inputs with indices
+// 2, 6, 10, 14, 18, 22, 26 and 30 produce output pixels 8-15.
+static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_
+ __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_
+
+ {
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
+ butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
+ }
+
+ v8 = _mm_add_epi16(u8, u9);
+ v9 = _mm_sub_epi16(u8, u9);
+ v14 = _mm_sub_epi16(u15, u14);
+ v15 = _mm_add_epi16(u15, u14);
+
+ {
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
+ butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
+ }
+
+ v10 = _mm_sub_epi16(u11, u10);
+ v11 = _mm_add_epi16(u11, u10);
+ v12 = _mm_add_epi16(u12, u13);
+ v13 = _mm_sub_epi16(u12, u13);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+ butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(v8, v11);
+ out[1] = _mm_add_epi16(v9, v10);
+ out[6] = _mm_add_epi16(v14, v13);
+ out[7] = _mm_add_epi16(v15, v12);
+
+ out[2] = _mm_sub_epi16(v9, v10);
+ out[3] = _mm_sub_epi16(v8, v11);
+ out[4] = _mm_sub_epi16(v15, v12);
+ out[5] = _mm_sub_epi16(v14, v13);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+ butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
+ }
+}
+
+// For each 8x32 block __m128i in[32], the inputs with indices
+// 0, 4, 8, 12, 16, 20, 24 and 28 produce output pixels 0-7.
+static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[8]*/) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_
+
+ {
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
+ butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
+ }
+
+ v4 = _mm_add_epi16(u4, u5);
+ v5 = _mm_sub_epi16(u4, u5);
+ v6 = _mm_sub_epi16(u7, u6);
+ v7 = _mm_add_epi16(u7, u6);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
+
+ butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
+ butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
+ }
+
+ v0 = _mm_add_epi16(u0, u3);
+ v1 = _mm_add_epi16(u1, u2);
+ v2 = _mm_sub_epi16(u1, u2);
+ v3 = _mm_sub_epi16(u0, u3);
+
+ out[0] = _mm_add_epi16(v0, v7);
+ out[1] = _mm_add_epi16(v1, v6);
+ out[2] = _mm_add_epi16(v2, v5);
+ out[3] = _mm_add_epi16(v3, v4);
+ out[4] = _mm_sub_epi16(v3, v4);
+ out[5] = _mm_sub_epi16(v2, v5);
+ out[6] = _mm_sub_epi16(v1, v6);
+ out[7] = _mm_sub_epi16(v0, v7);
+}
+
+// For each 8x32 block __m128i in[32], the odd-indexed inputs
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29 and 31 produce
+// output pixels 16-23 and 24-31.
+// Rather than hiding an offset of 16 inside this function, we write the
+// results to indices 0-15 of out[16] and let the caller supply the offset.
+static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+ __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+ __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+ __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+ {
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
+ butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
+ butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
+ butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
+
+ butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
+ butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
+
+ butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
+ butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
+ }
+
+ v16 = _mm_add_epi16(u16, u17);
+ v17 = _mm_sub_epi16(u16, u17);
+ v18 = _mm_sub_epi16(u19, u18);
+ v19 = _mm_add_epi16(u19, u18);
+
+ v20 = _mm_add_epi16(u20, u21);
+ v21 = _mm_sub_epi16(u20, u21);
+ v22 = _mm_sub_epi16(u23, u22);
+ v23 = _mm_add_epi16(u23, u22);
+
+ v24 = _mm_add_epi16(u24, u25);
+ v25 = _mm_sub_epi16(u24, u25);
+ v26 = _mm_sub_epi16(u27, u26);
+ v27 = _mm_add_epi16(u27, u26);
+
+ v28 = _mm_add_epi16(u28, u29);
+ v29 = _mm_sub_epi16(u28, u29);
+ v30 = _mm_sub_epi16(u31, u30);
+ v31 = _mm_add_epi16(u31, u30);
+
+ {
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+ butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+ butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+ butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
+ }
+
+ u16 = _mm_add_epi16(v16, v19);
+ u17 = _mm_add_epi16(v17, v18);
+ u18 = _mm_sub_epi16(v17, v18);
+ u19 = _mm_sub_epi16(v16, v19);
+ u20 = _mm_sub_epi16(v23, v20);
+ u21 = _mm_sub_epi16(v22, v21);
+ u22 = _mm_add_epi16(v22, v21);
+ u23 = _mm_add_epi16(v23, v20);
+
+ u24 = _mm_add_epi16(v24, v27);
+ u25 = _mm_add_epi16(v25, v26);
+ u26 = _mm_sub_epi16(v25, v26);
+ u27 = _mm_sub_epi16(v24, v27);
+
+ u28 = _mm_sub_epi16(v31, v28);
+ u29 = _mm_sub_epi16(v30, v29);
+ u30 = _mm_add_epi16(v29, v30);
+ u31 = _mm_add_epi16(v28, v31);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(u16, u23);
+ out[1] = _mm_add_epi16(u17, u22);
+ out[2] = _mm_add_epi16(u18, u21);
+ out[3] = _mm_add_epi16(u19, u20);
+ out[4] = _mm_sub_epi16(u19, u20);
+ out[5] = _mm_sub_epi16(u18, u21);
+ out[6] = _mm_sub_epi16(u17, u22);
+ out[7] = _mm_sub_epi16(u16, u23);
+
+ out[8] = _mm_sub_epi16(u31, u24);
+ out[9] = _mm_sub_epi16(u30, u25);
+ out[10] = _mm_sub_epi16(u29, u26);
+ out[11] = _mm_sub_epi16(u28, u27);
+ out[12] = _mm_add_epi16(u27, u28);
+ out[13] = _mm_add_epi16(u26, u29);
+ out[14] = _mm_add_epi16(u25, u30);
+ out[15] = _mm_add_epi16(u24, u31);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
+ butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
+ butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
+ butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
+ }
+}
+
+static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_full_8x32_quarter_1(in, temp);
+ idct32_full_8x32_quarter_2(in, &temp[8]);
+ add_sub_butterfly(temp, out, 16);
+}
+
+static void idct32_full_8x32(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[32];
+ idct32_full_8x32_quarter_1_2(in, temp);
+ idct32_full_8x32_quarter_3_4(in, &temp[16]);
+ add_sub_butterfly(temp, out, 32);
+}
+
+static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ in[i] = load_input_data(input);
+ in[i + 8] = load_input_data(input + 8);
+ in[i + 16] = load_input_data(input + 16);
+ in[i + 24] = load_input_data(input + 24);
+ input += 32;
+ }
+}
+
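+// Full 32x32 inverse transform: four row passes of 8 rows each write the
+// intermediate results to col[128]; four column passes of 8 columns each
+// then transform, round by (1 << 5), shift right by 6 and add the result to
+// dest via store_buffer_8x32().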
+void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[128], in[32];
+ int i, j;
+
+ // rows
+ for (i = 0; i < 4; ++i) {
+ load_buffer_8x32(input, in);
+ input += 32 << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ idct32_full_8x32(in, col + (i << 5));
+ }
+
+ // columns
+ for (i = 0; i < 4; ++i) {
+ j = i << 3;
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + j, in);
+ array_transpose_8x8(col + j + 32, in + 8);
+ array_transpose_8x8(col + j + 64, in + 16);
+ array_transpose_8x8(col + j + 96, in + 24);
+
+ idct32_full_8x32(in, in);
+ store_buffer_8x32(in, dest, stride);
+ dest += 8;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 000000000..f0668e6f3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,112 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+ ; a c d b to a b c d
+ SWAP 1, 3, 2
+%endmacro
+
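+; One column pass of the inverse 4x4 Walsh-Hadamard transform; in scalar
+; terms (roughly): a += c; d -= b; e = (a - d) >> 1; b = e - b; c = e - c;
+; a -= b; d += c.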
+%macro TRANSFORM_COLS 0
+ ; input:
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
+ paddw m0, m2
+ psubw m3, m1
+
+ ; wide subtract
+ punpcklwd m4, m0
+ punpcklwd m5, m3
+ psrad m4, 16
+ psrad m5, 16
+ psubd m4, m5
+ psrad m4, 1
+ packssdw m4, m4 ; e
+
+ psubw m5, m4, m1 ; b
+ psubw m4, m2 ; c
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movd m%3, [outputq]
+ movd m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+%if CONFIG_HIGHBITDEPTH
+ mova m0, [inputq + 0]
+ packssdw m0, [inputq + 16]
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48]
+%else
+ mova m0, [inputq + 0]
+ mova m1, [inputq + 16]
+%endif
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ REORDER_INPUTS
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 000000000..bf8150e2a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,915 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
+void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
+ const __m128i blimit =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
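+  // Roughly: `mask` now selects the lanes that pass the loop-filter mask test
+  // (|p0 - q0| * 2 + |p1 - q1| / 2 <= blimit and every neighbouring-pixel
+  // difference up to p3/q3 is <= limit), while `hev` flags high edge variance
+  // lanes where |p1 - p0| or |q1 - q0| exceeds thresh.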
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (aom_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
+
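Editorial note, not part of the imported source: the andnot/and/or triplets in the store sequence above are a branchless per-byte select keyed on the flat and flat2 masks, which are 0xFF where the stronger filter applies and 0x00 elsewhere. A minimal scalar sketch of that select:

static unsigned char select_by_mask(unsigned char mask, unsigned char if_set,
                                    unsigned char if_clear) {
  /* one-byte equivalent of _mm_or_si128(_mm_and_si128(mask, if_set),
     _mm_andnot_si128(mask, if_clear)) */
  return (unsigned char)((mask & if_set) | ((unsigned char)~mask & if_clear));
}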
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
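Editorial note: in _mm256_shuffle_epi8 an index byte with its high bit set produces zero, so this pattern (0, 128, 1, 128, ...) interleaves each source byte with a zero byte, i.e. it zero-extends the 16 bytes of each 128-bit lane into 16-bit lanes before the 16-bit filter arithmetic below. A scalar sketch of the effect, using hypothetical src/dst buffers:

static void zero_extend_u8_to_u16(const unsigned char *src,
                                  unsigned short *dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = src[i]; /* high byte becomes zero */
}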
+
+void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
+ p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+
+ const __m128i thresh =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
+ const __m128i blimit =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+
+ p256_4 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
+ p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+ q256_4 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+
+ p4 = _mm256_castsi256_si128(p256_4);
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ q4 = _mm256_castsi256_si128(q256_4);
+
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
+ flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (aom_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+ q256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+ p5 = _mm256_castsi256_si128(p256_5);
+ q5 = _mm256_castsi256_si128(q256_5);
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+ q256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+ p6 = _mm256_castsi256_si128(p256_6);
+ q6 = _mm256_castsi256_si128(q256_6);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
+ q256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
+ p7 = _mm256_castsi256_si128(p256_7);
+ q7 = _mm256_castsi256_si128(q256_7);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+ p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ q256_7 = _mm256_shuffle_epi8(q256_7, filter);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(
+ eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(
+ four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)),
+ 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)),
+ 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)),
+ 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)),
+ 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)),
+ 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)),
+ 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+ }
+ _mm256_zeroupper();
+}
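Editorial sketch, not part of the patch: the 15-tap "wide flat" filter that the pixelFilter_p/pixelFilter_q running sums above implement, for one pixel column. pp[i]/qq[i] are assumed to hold p_i/q_i as small ints; only the outermost two outputs are shown, including the window-slide step that each add/subtract pair corresponds to.

static void wide_flat_column_sketch(const int pp[8], const int qq[8],
                                    int *op6, int *op5) {
  /* op6 = (p7*7 + p6*2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4 */
  int sum = pp[7] * 7 + pp[6] * 2 + pp[5] + pp[4] + pp[3] + pp[2] + pp[1] +
            pp[0] + qq[0] + 8;
  *op6 = sum >> 4;
  /* slide the window: one copy of p7 and the extra p6 drop out, p5 and q1
     enter, giving op5 = (p7*6 + p6 + p5*2 + p4 + ... + q0 + q1 + 8) >> 4 */
  sum += pp[5] + qq[1] - pp[7] - pp[6];
  *op5 = sum >> 4;
}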
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
new file mode 100644
index 000000000..7e134dc63
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -0,0 +1,1892 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
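Editorial note: with unsigned saturating subtraction one of the two differences is zero, so OR-ing them yields the per-byte absolute difference. One-byte equivalent:

static unsigned char abs_diff_u8(unsigned char a, unsigned char b) {
  return (unsigned char)(a > b ? a - b : b - a);
}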
+
+#if CONFIG_PARALLEL_DEBLOCKING
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK4 \
+ do { \
+    /* (abs(q1 - q0), abs(p1 - p0)) */                                        \
+ __m128i flat = abs_diff(q1p1, q0p0); \
+ /* abs(p1 - q1), abs(p0 - q0) */ \
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
+ __m128i abs_p0q0, abs_p1q1; \
+ \
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
+ hev = \
+ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+ hev = _mm_cmpgt_epi16(hev, thresh); \
+ hev = _mm_packs_epi16(hev, hev); \
+ \
+ /* const int8_t mask = filter_mask2(*limit, *blimit, */ \
+ /* p1, p0, q0, q1); */ \
+ abs_p0q0 = \
+ _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
+ abs_p1q1 = \
+ _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
+ mask = _mm_unpacklo_epi64(mask, flat); \
+ mask = _mm_subs_epu8(mask, limit); \
+ mask = _mm_cmpeq_epi8(mask, zero); \
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
+ } while (0)
+#endif // CONFIG_PARALLEL_DEBLOCKING
+
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK \
+ do { \
+    /* (abs(q1 - q0), abs(p1 - p0)) */                                        \
+ __m128i flat = abs_diff(q1p1, q0p0); \
+ /* abs(p1 - q1), abs(p0 - q0) */ \
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
+ __m128i abs_p0q0, abs_p1q1, work; \
+ \
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
+ hev = \
+ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+ hev = _mm_cmpgt_epi16(hev, thresh); \
+ hev = _mm_packs_epi16(hev, hev); \
+ \
+ /* const int8_t mask = filter_mask(*limit, *blimit, */ \
+ /* p3, p2, p1, p0, q0, q1, q2, q3); */ \
+ abs_p0q0 = \
+ _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
+ abs_p1q1 = \
+ _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
+ /* abs(p3 - p2), abs(p2 - p1) */ \
+ work = abs_diff(p3p2, p2p1); \
+ flat = _mm_max_epu8(work, flat); \
+ /* abs(q3 - q2), abs(q2 - q1) */ \
+ work = abs_diff(q3q2, q2q1); \
+ flat = _mm_max_epu8(work, flat); \
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
+ mask = _mm_unpacklo_epi64(mask, flat); \
+ mask = _mm_subs_epu8(mask, limit); \
+ mask = _mm_cmpeq_epi8(mask, zero); \
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
+ } while (0)
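Editorial sketch of the scalar decision these two macros vectorize (FILTER_HEV_MASK4 is the same test without the p3/p2/q2/q3 terms). The mask is all ones where the edge may be filtered and zero where it must be left untouched; hev flags high edge variance, which routes those pixels to the sharper 4-tap path only. abs() is from <stdlib.h>:

static int filter_mask_sketch(int limit, int blimit, int p3, int p2, int p1,
                              int p0, int q0, int q1, int q2, int q3) {
  int mask = 0;
  mask |= (abs(p1 - p0) > limit) * -1;
  mask |= (abs(q1 - q0) > limit) * -1;
  mask |= (abs(p2 - p1) > limit) * -1;
  mask |= (abs(p3 - p2) > limit) * -1;
  mask |= (abs(q2 - q1) > limit) * -1;
  mask |= (abs(q3 - q2) > limit) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  return ~mask; /* -1 = filter, 0 = skip */
}

static int hev_mask_sketch(int thresh, int p1, int p0, int q0, int q1) {
  return (abs(p1 - p0) > thresh || abs(q1 - q0) > thresh) ? -1 : 0;
}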
+
+#define FILTER4 \
+ do { \
+ const __m128i t3t4 = \
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \
+ const __m128i t80 = _mm_set1_epi8(0x80); \
+ __m128i filter, filter2filter1, work; \
+ \
+ ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
+ qs1qs0 = _mm_xor_si128(q1q0, t80); \
+ \
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
+ work = _mm_subs_epi8(ps1ps0, qs1qs0); \
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
+ filter = _mm_and_si128(filter, mask); /* & mask */ \
+ filter = _mm_unpacklo_epi64(filter, filter); \
+ \
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
+ \
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
+ filter = _mm_unpacklo_epi8(filter, filter); \
+ filter = _mm_srai_epi16(filter, 9); /* round */ \
+ filter = _mm_packs_epi16(filter, filter); \
+ filter = _mm_andnot_si128(hev, filter); \
+ \
+ hev = _mm_unpackhi_epi64(filter2filter1, filter); \
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
+ \
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
+ qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
+ ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
+ qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
+ ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
+ } while (0)
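Editorial sketch of the scalar 4-tap filter that FILTER4 vectorizes: pixels are biased to signed range with ^0x80, the clamped adjustment is split into filter1/filter2 for q0/p0, and half of filter1 (rounded) adjusts p1/q1 only where hev is clear. mask and hev are assumed to be 0 or -1 per pixel, and an arithmetic right shift is assumed for negative values:

static int clamp_s8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

static void filter4_sketch(int mask, int hev, unsigned char *op1,
                           unsigned char *op0, unsigned char *oq0,
                           unsigned char *oq1) {
  const int ps1 = (int)*op1 - 128, ps0 = (int)*op0 - 128;
  const int qs0 = (int)*oq0 - 128, qs1 = (int)*oq1 - 128;
  int f = clamp_s8(ps1 - qs1) & hev;
  f = clamp_s8(f + 3 * (qs0 - ps0)) & mask;
  const int f1 = clamp_s8(f + 4) >> 3; /* applied to q0 */
  const int f2 = clamp_s8(f + 3) >> 3; /* applied to p0 */
  *oq0 = (unsigned char)(clamp_s8(qs0 - f1) + 128);
  *op0 = (unsigned char)(clamp_s8(ps0 + f2) + 128);
  f = ((f1 + 1) >> 1) & ~hev; /* ROUND_POWER_OF_TWO(filter1, 1) & ~hev */
  *oq1 = (unsigned char)(clamp_s8(qs1 - f) + 128);
  *op1 = (unsigned char)(clamp_s8(ps1 + f) + 128);
}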
+
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ const __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ __m128i p3p2, p2p1, q3q2, q2q1;
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
+ __m128i mask, hev;
+#if !CONFIG_PARALLEL_DEBLOCKING
+ p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+ _mm_loadl_epi64((__m128i *)(s - 4 * p)));
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+#if !CONFIG_PARALLEL_DEBLOCKING
+ q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+ q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+#if !CONFIG_PARALLEL_DEBLOCKING
+ FILTER_HEV_MASK;
+#else // CONFIG_PARALLEL_DEBLOCKING
+ FILTER_HEV_MASK4;
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ FILTER4;
+
+ _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1
+ _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0
+ _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0
+ _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1
+}
+
+void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ const __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i x0, x1, x2, x3;
+#if !CONFIG_PARALLEL_DEBLOCKING
+ __m128i p3p2, p2p1, q3q2, q2q1;
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
+ __m128i mask, hev;
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+
+ // Transpose 8x8
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ p1p0 = _mm_unpacklo_epi16(q1q0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x0 = _mm_unpacklo_epi16(x2, x3);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ q1q0 = _mm_unpackhi_epi16(q1q0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x2 = _mm_unpackhi_epi16(x2, x3);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ q1q0 = _mm_unpacklo_epi32(q1q0, x2);
+
+ q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
+ q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+ q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+#if !CONFIG_PARALLEL_DEBLOCKING
+ FILTER_HEV_MASK;
+#else // CONFIG_PARALLEL_DEBLOCKING
+ FILTER_HEV_MASK4;
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ FILTER4;
+
+ // Transpose 8x4 to 4x8
+  // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
+ // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
+ x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
+ // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
+ ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+
+ *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+
+ *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+}
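Editorial note: the unpack sequences above perform an 8x8 byte transpose so the vertical (column) edge can be filtered with the same row-oriented FILTER_HEV_MASK/FILTER4 logic, then transposed back before the 4-byte stores. Plain-C equivalent of the data movement:

static void transpose_8x8_sketch(const unsigned char src[8][8],
                                 unsigned char dst[8][8]) {
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) dst[c][r] = src[r][c];
}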
+
+void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ __m128i mask, hev, flat, flat2;
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = abs_diff(q0p0, p0q0);
+ abs_p1q1 = abs_diff(q1p1, p1q1);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ // Filter1 >> 3
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+ flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+ work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
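Editorial note: this kernel keeps each symmetric pair of rows in one register, with the p-side pixels in the low 64 bits and the matching q-side pixels in the high 64 bits (q4p4, q3p3, ...), so both sides of the edge run through a single instruction stream. A sketch of that paired load, assuming the headers already included by this file and a hypothetical tap offset off:

static __m128i load_qp_pair_sketch(const unsigned char *s, int pitch, int off) {
  /* low 8 bytes:  p[off] row at s - (off + 1) * pitch
     high 8 bytes: q[off] row at s + off * pitch       */
  __m128i v = _mm_loadl_epi64((const __m128i *)(s - (off + 1) * pitch));
  return _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(v), (const __m64 *)(s + off * pitch)));
}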
+
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+ const __m128i *const a1,
+ const __m128i *const a2,
+ const __m128i *const s1,
+ const __m128i *const s2) {
+ __m128i x = _mm_add_epi16(*a1, *total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+ return x;
+}
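Editorial note: filter_add2_sub2 is the sliding-window update for the filter sums. As the output position advances one tap, two samples enter the window and two leave it; scalar form:

static int add2_sub2_sketch(int total, int a1, int a2, int s1, int s2) {
  return total + a1 + a2 - s1 - s2;
}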
+
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f8_lo,
+ const __m128i *const f8_hi) {
+ const __m128i f8 =
+ _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
+ const __m128i result = _mm_and_si128(*flat, f8);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f_lo,
+ const __m128i *const f_hi) {
+ const __m128i f =
+ _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
+ const __m128i result = _mm_and_si128(*flat, f);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
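Editorial sketch of what filter8_mask and filter16_mask compute per pixel: shift the accumulated sum (the rounding constant 4 or 8 is already folded in by the callers) right by 3 or 4, then select between the filtered value and the narrower-filter fallback using the flat/flat2 byte mask:

static unsigned char filter_select_sketch(unsigned char flat,
                                          unsigned char other, int sum,
                                          int shift /* 3 or 4 */) {
  const unsigned char f = (unsigned char)(sum >> shift); /* fits in 8 bits */
  return (unsigned char)((flat & f) | ((unsigned char)~flat & other));
}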
+
+void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ __m128i mask, hev, flat, flat2;
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+ q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+ // loopfilter done
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
+ const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+ const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+ const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+ const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+ const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+ const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+ const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
+
+ const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
+ const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+ const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+ const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+ const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+ const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+ const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
+
+ __m128i f_lo;
+ __m128i f_hi;
+
+ f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
+ f_lo =
+ _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+ f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
+
+ f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
+ f_hi =
+ _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+ f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
+
+ p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+
+ f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+ p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+ p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+ p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+ op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+ op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+ op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+ oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+ oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+ oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+ q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+ q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+ q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+ q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ }
+}
+
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * p)));
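+  // Each register pairs a p row in its low 64 bits with the matching q row
+  // in its high 64 bits; shuffling by 78 (0x4e) swaps the two halves so the
+  // p and q roles are exchanged.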
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ // filter_mask and hev_mask
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(q0p0, p0q0);
+ abs_p1q1 = abs_diff(q1p1, p1q1);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // flat_mask4
+
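+    // flat is set where each of |p3-p0|, |p2-p0|, |p1-p0|, |q1-q0|, |q2-q0|
+    // and |q3-q0| is at most 1, i.e. where the filter8 output should be used.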
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ {
+ __m128i workp_a, workp_b, workp_shft;
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+ }
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
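+    // (placing each byte in the high half of a 16-bit lane and arithmetic
+    // shifting by 11 == 8 + 3 gives a sign-correct per-byte >> 3)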
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 11);
+ filter1 = _mm_packs_epi16(filter1, filter1);
+
+ // Filter2 >> 3
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 11);
+ filter2 = _mm_packs_epi16(filter2, zero);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ filt = _mm_unpacklo_epi8(zero, filt);
+ filt = _mm_srai_epi16(filt, 9);
+ filt = _mm_packs_epi16(filt, zero);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+ }
+}
+
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // flat_mask4
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ int i = 0;
+
+ do {
+ __m128i workp_a, workp_b, workp_shft;
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ src += 8;
+ } while (++i < 2);
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
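+    // (there is no 8-bit shift, so shift the 16-bit lanes logically and then
+    // restore the sign bits of negative bytes with the te0/t1f masks)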
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ }
+}
+
+void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+ const __m128i zero = _mm_set1_epi16(0);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ __m128i p3, p2, q2, q3;
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ __m128i p1, p0, q0, q1;
+ __m128i mask, hev, flat;
+#if !CONFIG_PARALLEL_DEBLOCKING
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+#if !CONFIG_PARALLEL_DEBLOCKING
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ // filter_mask and hev_mask
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+#if !CONFIG_PARALLEL_DEBLOCKING
+ __m128i work;
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ }
+}
+
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+ int in_p, unsigned char *out, int out_p) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+ // 2-way interleave w/hoisting of unpacks
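+  // (the trailing numbers record the logical operation order before the
+  // loads and unpacks were interleaved for scheduling)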
+ x0 = _mm_loadl_epi64((__m128i *)in0); // 1
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
+ x0 = _mm_unpacklo_epi8(x0, x1); // 1
+
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7
+ x1 = _mm_unpacklo_epi8(x2, x3); // 2
+
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11
+ x2 = _mm_unpacklo_epi8(x4, x5); // 3
+
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15
+ x3 = _mm_unpacklo_epi8(x6, x7); // 4
+ x4 = _mm_unpacklo_epi16(x0, x1); // 9
+
+ x8 = _mm_loadl_epi64((__m128i *)in1); // 2
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
+ x8 = _mm_unpacklo_epi8(x8, x9); // 5
+ x5 = _mm_unpacklo_epi16(x2, x3); // 10
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8
+ x9 = _mm_unpacklo_epi8(x10, x11); // 6
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12
+ x10 = _mm_unpacklo_epi8(x12, x13); // 7
+ x12 = _mm_unpacklo_epi16(x8, x9); // 11
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16
+ x11 = _mm_unpacklo_epi8(x14, x15); // 8
+ x13 = _mm_unpacklo_epi16(x10, x11); // 12
+
+ x6 = _mm_unpacklo_epi32(x4, x5); // 13
+ x7 = _mm_unpackhi_epi32(x4, x5); // 14
+ x14 = _mm_unpacklo_epi32(x12, x13); // 15
+ x15 = _mm_unpackhi_epi32(x12, x13); // 16
+
+ // Store first 4-line result
+ _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ x12 = _mm_unpackhi_epi16(x8, x9);
+ x13 = _mm_unpackhi_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store second 4-line result
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
+#if CONFIG_PARALLEL_DEBLOCKING
+#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
+#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
+#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
+#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
+#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)
+#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
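+// 0x39 == _MM_SHUFFLE(0, 3, 2, 1): rotate the four 32-bit lanes right by one
+// so that lane 0 always holds the next 4-pixel row for movd to store.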
+enum { ROTATE_DWORD_RIGHT = 0x39 };
+static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
+ const uint8_t *pSrc,
+ const ptrdiff_t srcStride) {
+ for (uint32_t idx = 0; idx < 2; idx += 1) {
+ __m128i r0, r1, r2, r3;
+ // load data
+ r0 = movq(pSrc);
+ r1 = movq(pSrc + srcStride);
+ r2 = movq(pSrc + srcStride * 2);
+ r3 = movq(pSrc + srcStride * 3);
+ // transpose
+ r0 = punpcklbw(r0, r1);
+ r2 = punpcklbw(r2, r3);
+ r1 = punpckhwd(r0, r2);
+ r0 = punpcklwd(r0, r2);
+ // store data
+ movd(pDst, r0);
+ r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+ movd(pDst + dstStride, r0);
+ r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+ movd(pDst + dstStride * 2, r0);
+ r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+ movd(pDst + dstStride * 3, r0);
+ movd(pDst + dstStride * 4, r1);
+ r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+ movd(pDst + dstStride * 5, r1);
+ r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+ movd(pDst + dstStride * 6, r1);
+ r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+ movd(pDst + dstStride * 7, r1);
+ // advance the pointers
+ pDst += dstStride * 8;
+ pSrc += 8;
+ }
+}
+
+#endif // CONFIG_PARALLEL_DEBLOCKING
+static INLINE void transpose(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ unsigned char *in = src[idx8x8];
+ unsigned char *out = dst[idx8x8];
+
+ x0 =
+ _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ x1 =
+ _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 =
+ _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ x3 =
+ _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 =
+ _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ x5 =
+ _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 =
+ _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ x7 =
+ _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x5 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ _mm_storel_pd((double *)(out + 0 * out_p),
+ _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
+ _mm_storeh_pd((double *)(out + 1 * out_p),
+ _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ _mm_storel_pd((double *)(out + 2 * out_p),
+ _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
+ _mm_storeh_pd((double *)(out + 3 * out_p),
+ _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ _mm_storel_pd((double *)(out + 4 * out_p),
+ _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
+ _mm_storeh_pd((double *)(out + 5 * out_p),
+ _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ _mm_storel_pd((double *)(out + 6 * out_p),
+ _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
+ _mm_storeh_pd((double *)(out + 7 * out_p),
+ _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ unsigned char *src[2];
+ unsigned char *dst[2];
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+#if !CONFIG_PARALLEL_DEBLOCKING
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, p, 2);
+#else // CONFIG_PARALLEL_DEBLOCKING
+ transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
+#endif // !CONFIG_PARALLEL_DEBLOCKING
+}
+
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
+ unsigned char *src[1];
+ unsigned char *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ transpose(src, p, dst, 8, 1);
+
+ // Loop filtering
+ aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ transpose(src, 8, dst, p, 1);
+}
+
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, p, 2);
+}
+
+void aom_lpf_vertical_16_sse2(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ src[0] = s - 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
+
+ // Transpose 16x8
+ transpose(src, p, dst, 8, 2);
+
+ // Loop filtering
+ aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
+
+ src[0] = t_dst;
+ src[1] = t_dst + 8 * 8;
+ dst[0] = s - 8;
+ dst[1] = s;
+
+ // Transpose back
+ transpose(src, 8, dst, p, 2);
+}
+
+void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+ transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+
+ // Transpose back
+ transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+ transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 000000000..5166e9e0a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "aom_ports/mem.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
+ __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+ __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+ return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
+ __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
+ __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
+ __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
+ temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
+ temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
+ temp1 = _mm_unpacklo_epi32(temp1, temp2);
+ return _mm_unpacklo_epi64(temp3, temp1);
+}
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height);
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height);
+
+#define MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
+ m, n); \
+ }
+
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+#endif // CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+
+#define MASKSAD8XN_SSSE3(n) \
+ unsigned int aom_masked_sad8x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, n); \
+ }
+
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+
+#define MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_masked_sad4x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, n); \
+ }
+
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 16
+// Assumes values in m are <=64
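+// The mask weights each absolute difference by up to 64, so the final
+// (sum + 31) >> 6 removes that scaling with rounding.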
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int y, x;
+ __m128i a, b, m, temp1, temp2;
+ __m128i res = _mm_setzero_si128();
+ __m128i one = _mm_set1_epi16(1);
+ // For each row
+ for (y = 0; y < height; y++) {
+ // Covering the full width
+ for (x = 0; x < width; x += 16) {
+ // Load a, b, m in xmm registers
+ a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+ b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+ m = _mm_loadu_si128((const __m128i *)(m_ptr + x));
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu8(a, b);
+ temp2 = _mm_subs_epu8(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
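+      // (_mm_maddubs_epi16 treats the absolute differences as unsigned and
+      // the mask, which fits in a signed byte, as signed, producing pairwise
+      // 16-bit sums)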
+ temp2 = _mm_maddubs_epi16(temp1, m);
+ // Pad out row result to 32 bit integers & add to running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
+ }
+ // Move onto the next row
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height) {
+ int y;
+ __m128i a, b, m, temp1, temp2, row_res;
+ __m128i res = _mm_setzero_si128();
+ __m128i one = _mm_set1_epi16(1);
+ // Add the masked SAD for 2 rows at a time
+ for (y = 0; y < height; y += 2) {
+ // Load a, b, m in xmm registers
+ a = width8_load_2rows(a_ptr, a_stride);
+ b = width8_load_2rows(b_ptr, b_stride);
+ m = width8_load_2rows(m_ptr, m_stride);
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu8(a, b);
+ temp2 = _mm_subs_epu8(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
+ row_res = _mm_maddubs_epi16(temp1, m);
+
+ // Pad out row result to 32 bit integers & add to running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+ // Move onto the next rows
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height) {
+ int y;
+ __m128i a, b, m, temp1, temp2, row_res;
+ __m128i res = _mm_setzero_si128();
+ __m128i one = _mm_set1_epi16(1);
+ // Add the masked SAD for 4 rows at a time
+ for (y = 0; y < height; y += 4) {
+ // Load a, b, m in xmm registers
+ a = width4_load_4rows(a_ptr, a_stride);
+ b = width4_load_4rows(b_ptr, b_stride);
+ m = width4_load_4rows(m_ptr, m_stride);
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu8(a, b);
+ temp2 = _mm_subs_epu8(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
+ row_res = _mm_maddubs_epi16(temp1, m);
+
+ // Pad out row result to 32 bit integers & add to running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+ // Move onto the next rows
+ a_ptr += a_stride * 4;
+ b_ptr += b_stride * 4;
+ m_ptr += m_stride * 4;
+ }
+  // Horizontally add the accumulated 32-bit totals
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
+ int stride) {
+ __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+ __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+ return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width, int height);
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, m, n); \
+ }
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, n); \
+ }
+
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 8
+// Assumes values in m are <=64
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width, int height) {
+ int y, x;
+ __m128i a, b, m, temp1, temp2;
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+ __m128i res = _mm_setzero_si128();
+ // For each row
+ for (y = 0; y < height; y++) {
+ // Covering the full width
+ for (x = 0; x < width; x += 8) {
+ // Load a, b, m in xmm registers
+ a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+ b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+ m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
+ _mm_setzero_si128());
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu16(a, b);
+ temp2 = _mm_subs_epu16(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m, add adjacent pairs, and accumulate into the running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+ }
+ // Move onto the next row
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height) {
+ int y;
+ __m128i a, b, m, temp1, temp2;
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+ __m128i res = _mm_setzero_si128();
+ // Add the masked SAD for 2 rows at a time
+ for (y = 0; y < height; y += 2) {
+ // Load a, b, m in xmm registers
+ a = highbd_width4_load_2rows(a_ptr, a_stride);
+ b = highbd_width4_load_2rows(b_ptr, b_stride);
+ temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
+ temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
+ m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
+ _mm_setzero_si128());
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu16(a, b);
+ temp2 = _mm_subs_epu16(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
+ res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+
+ // Move onto the next rows
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 000000000..fe14597f6
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1948 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_filter.h"
+
+// Half pixel shift
+#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2)
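+// (BIL_SUBPEL_SHIFTS is the number of bilinear sub-pel positions, so half of
+// it indexes the half-pixel filter.)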
+
+/*****************************************************************************
+ * Horizontal additions
+ *****************************************************************************/
+
+static INLINE int32_t hsum_epi32_si32(__m128i v_d) {
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t hsum_epi64_si64(__m128i v_q) {
+ v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+ return _mm_cvtsi128_si64(v_q);
+#else
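+  // 32-bit targets have no _mm_cvtsi128_si64, so spill through memory.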
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_q);
+ return tmp;
+ }
+#endif
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE int64_t hsum_epi32_si64(__m128i v_d) {
+ const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+ const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+ const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+ return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
+ uint32_t *sse, int w, int h) {
+ int64_t sum64;
+ uint64_t sse64;
+
+ // Horizontal sum
+ sum64 = hsum_epi32_si32(v_sum_d);
+ sse64 = hsum_epi64_si64(v_sse_q);
+
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+ // Round
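+  // The 6-bit mask weights scale the sum by 64 and the squared error by
+  // 64 * 64, hence the 6- and 12-bit rounding shifts.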
+ sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute the variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+/*****************************************************************************
+ * n*16 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variancewxh_ssse3(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ int ii, jj;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((w % 16) == 0);
+
+ for (ii = 0; ii < h; ii++) {
+ for (jj = 0; jj < w; jj += 16) {
+ // Load inputs - 8 bits
+ const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj));
+ const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj));
+ const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj));
+
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+ const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+ const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
+ const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
+ const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
+ const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
+
+ // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
+ const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+ }
+
+ // Move on to next row
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+
+#define MASKED_VARWXH(W, H) \
+ unsigned int aom_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \
+ H, sse); \
+ }
+
+MASKED_VARWXH(16, 8)
+MASKED_VARWXH(16, 16)
+MASKED_VARWXH(16, 32)
+MASKED_VARWXH(32, 16)
+MASKED_VARWXH(32, 32)
+MASKED_VARWXH(32, 64)
+MASKED_VARWXH(64, 32)
+MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+MASKED_VARWXH(64, 128)
+MASKED_VARWXH(128, 64)
+MASKED_VARWXH(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+/*****************************************************************************
+ * 8 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variance8xh_ssse3(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int h, unsigned int *sse) {
+ int ii;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ for (ii = 0; ii < h; ii++) {
+ // Load inputs - 8 bits
+ const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m);
+
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+
+ // Move on to next row
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+#define MASKED_VAR8XH(H) \
+ unsigned int aom_masked_variance8x##H##_ssse3( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
+ sse); \
+ }
+
+MASKED_VAR8XH(4)
+MASKED_VAR8XH(8)
+MASKED_VAR8XH(16)
+
+/*****************************************************************************
+ * 4 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variance4xh_ssse3(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int h, unsigned int *sse) {
+ int ii;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((h % 2) == 0);
+
+ for (ii = 0; ii < h / 2; ii++) {
+ // Load 2 input rows - 8 bits
+ const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a);
+ const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b);
+ const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
+ const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride));
+ const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride));
+ const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
+
+ // Interleave 2 rows into a single register
+ const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b);
+ const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b);
+ const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
+
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+
+ // Move on to next 2 rows
+ a += a_stride * 2;
+ b += b_stride * 2;
+ m += m_stride * 2;
+ }
+
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+#define MASKED_VAR4XH(H) \
+ unsigned int aom_masked_variance4x##H##_ssse3( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
+ sse); \
+ }
+
+MASKED_VAR4XH(4)
+MASKED_VAR4XH(8)
+
+#if CONFIG_HIGHBITDEPTH
+
+// Main calculation for n*8 wide blocks
+static INLINE void highbd_masked_variance64_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) {
+ int ii, jj;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((w % 8) == 0);
+
+ for (ii = 0; ii < h; ii++) {
+ for (jj = 0; jj < w; jj += 8) {
+ // Load inputs - a and b are 16 bits, m is 8 bits
+ const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj));
+ const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj));
+ const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj));
+
+ // Unpack m to 16 bits - still containing max 8 bits
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-4095, 4095]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+ const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+ const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+ const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+ const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+ const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+ const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+ const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+ // Square and sum the errors -> 36bits * 4 = 38bits
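+ // (_mm_mul_epu32 squares the even 32-bit lanes; shifting right by 4 bytes
+ // moves the odd lanes into the even positions so a second _mm_mul_epu32 can
+ // square those, giving four 64-bit squares which are then summed.)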
+ __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+ v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+ v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+ v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+ v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+ v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+ v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+ v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+ v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+ v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
+ }
+
+ // Move on to next row
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ // Horizontal sum
+ *sum = hsum_epi32_si64(v_sum_d);
+ *sse = hsum_epi64_si64(v_sse_q);
+
+ // Round
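+ // (The mask values lie in [0, 64], so the accumulated sum carries a factor
+ // of 64 = 2^6 and the accumulated sse a factor of 64^2 = 2^12; the rounded
+ // shifts below remove those factors.)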
+ *sum = (*sum >= 0) ? *sum : -*sum;
+ *sum = ROUND_POWER_OF_TWO(*sum, 6);
+ *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+// Main calculation for 4 wide blocks
+static INLINE void highbd_masked_variance64_4wide_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) {
+ int ii;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((h % 2) == 0);
+
+ for (ii = 0; ii < h / 2; ii++) {
+ // Load 2 input rows - a and b are 16 bits, m is 8 bits
+ const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
+ const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+ const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+ const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
+
+ // Interleave 2 rows into a single register
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w);
+ const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
+
+ // Unpack m to 16 bits - still containing max 8 bits
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-4095, 4095]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incl. sign bit)
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+ const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+ const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+ const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+ const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+ const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+ const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+ const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+ // Square and sum the errors -> 36bits * 4 = 38bits
+ __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+ v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+ v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+ v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+ v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+ v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+ v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+ v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+ v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+ v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
+
+ // Move on to next row
+ a += a_stride * 2;
+ b += b_stride * 2;
+ m += m_stride * 2;
+ }
+
+ // Horizontal sum
+ *sum = hsum_epi32_si32(v_sum_d);
+ *sse = hsum_epi64_si64(v_sse_q);
+
+ // Round
+ *sum = (*sum >= 0) ? *sum : -*sum;
+ *sum = ROUND_POWER_OF_TWO(*sum, 6);
+ *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+static INLINE unsigned int highbd_masked_variancewxh_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ uint64_t sse64;
+ int64_t sum64;
+
+ if (w == 4)
+ highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
+ h, &sum64, &sse64);
+ else
+ highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
+ &sum64, &sse64);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute and return variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+static INLINE unsigned int highbd_10_masked_variancewxh_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ uint64_t sse64;
+ int64_t sum64;
+
+ if (w == 4)
+ highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
+ h, &sum64, &sse64);
+ else
+ highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
+ &sum64, &sse64);
+
+ // Normalise
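+ // (For 10-bit input the sum is scaled down by 2^2 and the sse by 2^4 so the
+ // reported variance stays on the same scale as the 8-bit version; the 12-bit
+ // variant below uses shifts of 4 and 8 in the same way.)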
+ sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 4);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute and return variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+static INLINE unsigned int highbd_12_masked_variancewxh_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ uint64_t sse64;
+ int64_t sum64;
+
+ if (w == 4)
+ highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
+ h, &sum64, &sse64);
+ else
+ highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
+ &sum64, &sse64);
+
+ sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 8);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute and return variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+#define HIGHBD_MASKED_VARWXH(W, H) \
+ unsigned int aom_highbd_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
+ return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
+ m_stride, W, H, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
+ return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
+ m_stride, W, H, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
+ return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
+ m_stride, W, H, sse); \
+ }
+
+HIGHBD_MASKED_VARWXH(4, 4)
+HIGHBD_MASKED_VARWXH(4, 8)
+HIGHBD_MASKED_VARWXH(8, 4)
+HIGHBD_MASKED_VARWXH(8, 8)
+HIGHBD_MASKED_VARWXH(8, 16)
+HIGHBD_MASKED_VARWXH(16, 8)
+HIGHBD_MASKED_VARWXH(16, 16)
+HIGHBD_MASKED_VARWXH(16, 32)
+HIGHBD_MASKED_VARWXH(32, 16)
+HIGHBD_MASKED_VARWXH(32, 32)
+HIGHBD_MASKED_VARWXH(32, 64)
+HIGHBD_MASKED_VARWXH(64, 32)
+HIGHBD_MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKED_VARWXH(64, 128)
+HIGHBD_MASKED_VARWXH(128, 64)
+HIGHBD_MASKED_VARWXH(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+#endif  // CONFIG_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////////////////////////////
+// Sub pixel versions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b,
+ __m128i v_filter_b);
+
+static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b,
+ const __m128i v_filter_b) {
+ (void)v_filter_b;
+ return _mm_avg_epu8(v_a_b, v_b_b);
+}
+
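+// Bilinear filter for a full register of pixels: each output byte is
+// (f0 * a + f1 * b + (1 << (FILTER_BITS - 1))) >> FILTER_BITS, where the two
+// taps are packed into v_filter_b as alternating bytes [f0, f1, ...] and
+// _mm_maddubs_epi16 performs the per-pixel multiply-accumulate on the
+// interleaved {a, b} bytes.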
+static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b,
+ const __m128i v_filter_b) {
+ const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
+ __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b);
+ __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b);
+ __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b);
+ __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b);
+ __m128i v_res_lo_w =
+ _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
+ __m128i v_res_hi_w =
+ _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS);
+ return _mm_packus_epi16(v_res_lo_w, v_res_hi_w);
+}
+
+// Apply the filter to the contents of the lower half of a and b
+static INLINE void apply_filter_lo(const __m128i v_a_lo_b,
+ const __m128i v_b_lo_b,
+ const __m128i v_filter_b, __m128i *v_res_w) {
+ const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
+ __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b);
+ __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b);
+ *v_res_w =
+ _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
+}
+
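+// Accumulate the masked error for 16 pixels: for each pixel the
+// (unnormalised) error e = (a - b) * m is added into *v_sum_d and e * e into
+// *v_sse_q.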
+static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b,
+ const __m128i v_m_b, __m128i *v_sum_d,
+ __m128i *v_sse_q) {
+ const __m128i v_zero = _mm_setzero_si128();
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+ const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+ const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
+ const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
+ const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
+ const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
+
+ // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
+ const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d);
+ *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d);
+ *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q);
+ *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q);
+}
+
+// Functions for width (W) >= 16
+unsigned int aom_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride,
+ int yoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int w, int h,
+ filter_fn_t filter_fn) {
+ int i, j;
+ __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_b = _mm_set1_epi16(
+ (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 16) {
+ // Load the first row ready
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row and apply the filter
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
+ v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row and apply the filter
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
+ v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride));
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int aom_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride,
+ int xoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int w, int h,
+ filter_fn_t filter_fn) {
+ int i, j;
+ __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_b = _mm_set1_epi16(
+ (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j += 16) {
+ // Load this row and the same row shifted right by one pixel & apply the
+ // filter
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
+
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ msk += msk_stride;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int aom_masked_subpel_varWxH_xnonzero_ynonzero(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int w, int h, filter_fn_t xfilter_fn,
+ filter_fn_t yfilter_fn) {
+ int i, j;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b;
+ __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filterx_b = _mm_set1_epi16(
+ (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
+ const __m128i v_filtery_b = _mm_set1_epi16(
+ (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 16) {
+ // Load the first row ready
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row & apply the filter
+ v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
+ v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
+ v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b);
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row & apply the filter
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
+ v_src1_b =
+ _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
+ v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b);
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+
+// Note the order in which rows are loaded: xmm[127:96] = row 1,
+// xmm[95:64] = row 2, xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int aom_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride,
+ int yoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w;
+ __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
+ __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first row of src data ready
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ for (i = 0; i < h; i += 4) {
+ // Load the rest of the source data for these rows
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+ v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+ v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+ v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
+ // Load the dst data
+ v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
+ v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
+ v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+ v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
+ v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
+ v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+ v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+ // Load the mask data
+ v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
+ v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
+ v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+ v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
+ v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
+ v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+ v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+ // Apply the y filter
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b);
+ v_src2_b =
+ _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
+ _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
+ v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b);
+ } else {
+ v_src2_b =
+ _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
+ _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0)));
+ apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w);
+ v_src2_b =
+ _mm_or_si128(_mm_slli_si128(v_src3_b, 4),
+ _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
+ apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w);
+ v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w);
+ }
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+// Note the order in which rows are loaded:
+// xmm[127:64] = row 1, xmm[63:0] = row 2
+unsigned int aom_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride,
+ int yoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b;
+ __m128i v_dst_b = _mm_setzero_si128();
+ __m128i v_msk_b = _mm_setzero_si128();
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first row of src data ready
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ for (i = 0; i < h; i += 2) {
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ // Load the rest of the source data for these rows
+ v_src1_b = _mm_or_si128(
+ _mm_slli_si128(v_src0_b, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
+ v_src0_b = _mm_or_si128(
+ _mm_slli_si128(v_src1_b, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
+ // Apply the y filter
+ v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b);
+ } else {
+ // Load the data and apply the y filter
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w);
+ v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w);
+ v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// Note the order in which rows are loaded: xmm[127:96] = row 1,
+// xmm[95:64] = row 2, xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int aom_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride,
+ int xoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
+ __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
+ __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
+ __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i += 4) {
+ // Load the src data
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
+ v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
+ v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+ v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+ v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
+ v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
+ // Load the dst data
+ v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
+ v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
+ v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+ v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
+ v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
+ v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+ v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+ // Load the mask data
+ v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
+ v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
+ v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+ v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
+ v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
+ v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+ v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
+ v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
+ v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
+ apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w);
+ v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
+ }
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride,
+ int xoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w;
+ __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i += 2) {
+ // Load the src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w);
+ v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// Note the order in which rows are loaded: xmm[127:96] = row 1,
+// xmm[95:64] = row 2, xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int aom_masked_subpel_var4xH_xnonzero_ynonzero(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
+ __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
+ __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b;
+ __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b;
+ __m128i v_xres_b[2];
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i += 4) {
+ // Load the src data
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
+ v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
+ v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+ v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+ v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
+ v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
+ v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
+ v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w);
+ v_xres_b[i == 0 ? 0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
+ }
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ }
+ // Load one more row to be used in the y filter
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b),
+ _mm_setr_epi32(-1, 0, 0, 0));
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ v_extra_row_b =
+ _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()),
+ _mm_setr_epi32(-1, 0, 0, 0));
+ }
+
+ for (i = 0; i < h; i += 4) {
+ if (h == 8 && i == 0) {
+ v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4),
+ _mm_srli_si128(v_xres_b[1], 12));
+ } else {
+ v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4),
+ v_extra_row_b);
+ }
+ // Apply the y filter
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b);
+ } else {
+ v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b);
+ }
+
+ // Load the dst data
+ v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
+ v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
+ v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+ v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
+ v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
+ v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+ v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+ // Load the mask data
+ v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
+ v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
+ v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+ v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
+ v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
+ v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+ v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_masked_subpel_var8xH_xnonzero_ynonzero(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b;
+ __m128i v_src0_shift_b, v_src1_shift_b;
+ __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first block of src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+ v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ for (i = 0; i < h; i += 4) {
+ // Load the next block of src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+ v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ // Apply the y filter to the previous block
+ v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8),
+ _mm_slli_si128(v_xres1_b, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b);
+ } else {
+ v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next block of src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+ v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ // Apply the y filter to the previous block
+ v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8),
+ _mm_slli_si128(v_xres0_b, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b);
+ } else {
+ v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// For W >= 16
+#define MASK_SUBPIX_VAR_LARGE(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ assert(W % 16 == 0); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) \
+ return aom_masked_variance##W##x##H##_ssse3( \
+ src, src_stride, dst, dst_stride, msk, msk_stride, sse); \
+ else if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_xzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \
+ dst_stride, msk, msk_stride, \
+ sse, W, H, apply_filter); \
+ } else if (yoffset == 0) { \
+ if (xoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_yzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \
+ dst_stride, msk, msk_stride, \
+ sse, W, H, apply_filter); \
+ } else if (xoffset == HALF_PIXEL_OFFSET) { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \
+ dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \
+ apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter_avg, apply_filter); \
+ } else { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter, apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter, apply_filter); \
+ } \
+ }
+
+// For W < 16
+#define MASK_SUBPIX_VAR_SMALL(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ assert(W == 4 || W == 8); \
+ if (xoffset == 0 && yoffset == 0) \
+ return aom_masked_variance##W##x##H##_ssse3( \
+ src, src_stride, dst, dst_stride, msk, msk_stride, sse); \
+ else if (xoffset == 0) \
+ return aom_masked_subpel_var##W##xH_xzero( \
+ src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \
+ else if (yoffset == 0) \
+ return aom_masked_subpel_var##W##xH_yzero( \
+ src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \
+ else \
+ return aom_masked_subpel_var##W##xH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
+ sse, H); \
+ }
+
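+// For example, MASK_SUBPIX_VAR_SMALL(4, 4) below defines
+// aom_masked_sub_pixel_variance4x4_ssse3(), which returns
+// aom_masked_variance4x4_ssse3() when both offsets are zero and otherwise
+// forwards to the matching aom_masked_subpel_var4xH_* routine above.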
+MASK_SUBPIX_VAR_SMALL(4, 4)
+MASK_SUBPIX_VAR_SMALL(4, 8)
+MASK_SUBPIX_VAR_SMALL(8, 4)
+MASK_SUBPIX_VAR_SMALL(8, 8)
+MASK_SUBPIX_VAR_SMALL(8, 16)
+MASK_SUBPIX_VAR_LARGE(16, 8)
+MASK_SUBPIX_VAR_LARGE(16, 16)
+MASK_SUBPIX_VAR_LARGE(16, 32)
+MASK_SUBPIX_VAR_LARGE(32, 16)
+MASK_SUBPIX_VAR_LARGE(32, 32)
+MASK_SUBPIX_VAR_LARGE(32, 64)
+MASK_SUBPIX_VAR_LARGE(64, 32)
+MASK_SUBPIX_VAR_LARGE(64, 64)
+#if CONFIG_EXT_PARTITION
+MASK_SUBPIX_VAR_LARGE(64, 128)
+MASK_SUBPIX_VAR_LARGE(128, 64)
+MASK_SUBPIX_VAR_LARGE(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_HIGHBITDEPTH
+typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
+ uint32_t *sse, int w, int h);
+typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride,
+ unsigned int *sse);
+typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w,
+ __m128i v_filter_w);
+
+static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w,
+ const __m128i v_b_w,
+ const __m128i v_filter_w) {
+ (void)v_filter_w;
+ return _mm_avg_epu16(v_a_w, v_b_w);
+}
+
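+// 16-bit bilinear filter: the taps are packed into v_filter_w as alternating
+// 16-bit words [f0, f1, ...], so _mm_madd_epi16 on the interleaved {a, b}
+// words gives f0 * a + f1 * b per pixel, which is then rounded and shifted by
+// FILTER_BITS.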
+static INLINE __m128i highbd_apply_filter(const __m128i v_a_w,
+ const __m128i v_b_w,
+ const __m128i v_filter_w) {
+ const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+ __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w);
+ __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w);
+ __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w);
+ __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w);
+ __m128i v_res_lo_d =
+ _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
+ __m128i v_res_hi_d =
+ _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS);
+ return _mm_packs_epi32(v_res_lo_d, v_res_hi_d);
+}
+// Apply the filter to the contents of the lower half of a and b
+static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w,
+ const __m128i v_b_lo_w,
+ const __m128i v_filter_w,
+ __m128i *v_res_d) {
+ const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+ __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w);
+ __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w);
+ *v_res_d =
+ _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
+}
+
+static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w,
+ const __m128i v_m_b, __m128i *v_sum_d,
+ __m128i *v_sse_q) {
+ const __m128i v_zero = _mm_setzero_si128();
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-2^12, 2^12] => 13 bits (incl. sign bit)
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-4095, 4095] * [0, 64] & sum pairs => fits in 19 + 1 bits
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+ const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+ const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+ const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+ const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+ const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+ const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+ const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+ // Square and sum the errors -> 36bits * 4 = 38bits
+ __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+ v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+ v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+ v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+ v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+ v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+ v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+ v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+ v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+ v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+ // Accumulate
+ *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
+ *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
+}
+
+static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d,
+ __m128i v_sse_q,
+ uint32_t *sse, int w,
+ int h) {
+ int64_t sum64;
+ uint64_t sse64;
+
+ // Horizontal sum
+ sum64 = hsum_epi32_si32(v_sum_d);
+ sse64 = hsum_epi64_si64(v_sse_q);
+
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+ // Round
+ sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+ // Normalise
+ sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 4);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute the variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d,
+ __m128i v_sse_q,
+ uint32_t *sse, int w,
+ int h) {
+ int64_t sum64;
+ uint64_t sse64;
+
+ // Horizontal sum
+ sum64 = hsum_epi32_si64(v_sum_d);
+ sse64 = hsum_epi64_si64(v_sse_q);
+
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+ // Round
+ sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+ // Normalise
+ sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 8);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute the variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+// High bit depth functions for width (W) >= 8
+unsigned int aom_highbd_masked_subpel_varWxH_xzero(
+ const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int w, int h, highbd_filter_fn_t filter_fn,
+ highbd_calc_masked_var_t calc_var) {
+ int i, j;
+ __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_w =
+ _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 8) {
+ // Load the first row ready
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row and apply the filter
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
+ v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row and apply the filter
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
+ v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride));
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int aom_highbd_masked_subpel_varWxH_yzero(
+ const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int w, int h, highbd_filter_fn_t filter_fn,
+ highbd_calc_masked_var_t calc_var) {
+ int i, j;
+ __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_w =
+ _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j += 8) {
+ // Load this row and the same row shifted right by one pixel & apply the
+ // filter
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
+
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ msk += msk_stride;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+
+unsigned int aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(
+ const uint16_t *src, int src_stride, int xoffset, int yoffset,
+ const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn,
+ highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) {
+ int i, j;
+ __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w;
+ __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filterx_w =
+ _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ const __m128i v_filtery_w =
+ _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 8) {
+ // Load the first row ready
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row & apply the filter
+ v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
+ v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
+ v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w);
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row & apply the filter
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
+ v_src1_w =
+ _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
+ v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w);
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+
+// Note the order in which rows are loaded:
+// xmm[127:64] = row 1, xmm[63:0] = row 2
+unsigned int aom_highbd_masked_subpel_var4xH_xzero(
+ const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int h, highbd_calc_masked_var_t calc_var) {
+ int i;
+ __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w;
+ __m128i v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first row of src data ready
+ v_src0_w = _mm_loadl_epi64((const __m128i *)src);
+ for (i = 0; i < h; i += 2) {
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ // Load the rest of the source data for these rows
+ v_src1_w = _mm_or_si128(
+ _mm_slli_si128(v_src0_w, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
+ v_src0_w = _mm_or_si128(
+ _mm_slli_si128(v_src1_w, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
+ // Apply the y filter
+ v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w);
+ } else {
+ // Load the data and apply the y filter
+ v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d);
+ v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d);
+ v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_highbd_masked_subpel_var4xH_yzero(
+ const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int h, highbd_calc_masked_var_t calc_var) {
+ int i;
+ __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d;
+ __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i += 2) {
+ // Load the src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w,
+ &v_filtered1_d);
+ v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero(
+ const uint16_t *src, int src_stride, int xoffset, int yoffset,
+ const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) {
+ int i;
+ __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b;
+ __m128i v_src0_shift_w, v_src1_shift_w;
+ __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first block of src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+ &v_filtered1_d);
+ v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ for (i = 0; i < h; i += 4) {
+ // Load the next block of src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+ &v_filtered1_d);
+ v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ // Apply the y filter to the previous block
+ v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8),
+ _mm_slli_si128(v_xres1_w, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w);
+ } else {
+ v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next block of src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+ &v_filtered1_d);
+ v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ // Apply the y filter to the previous block
+ v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8),
+ _mm_slli_si128(v_xres0_w, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w);
+ } else {
+ v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+// For W >=8
+#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \
+ unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse, highbd_calc_masked_var_t calc_var, \
+ highbd_variance_fn_t full_variance_function) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ assert(W % 8 == 0); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) \
+ return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \
+ msk_stride, sse); \
+ else if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_xzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_xzero( \
+ src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \
+ W, H, highbd_apply_filter, calc_var); \
+ } else if (yoffset == 0) { \
+ if (xoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_yzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_yzero( \
+ src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \
+ W, H, highbd_apply_filter, calc_var); \
+ } else if (xoffset == HALF_PIXEL_OFFSET) { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \
+ dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \
+ highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter_avg, \
+ highbd_apply_filter, calc_var); \
+ } else { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter, \
+ highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter, highbd_apply_filter, \
+ calc_var); \
+ } \
+ }
+
+// For W < 8
+#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \
+ unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse, highbd_calc_masked_var_t calc_var, \
+ highbd_variance_fn_t full_variance_function) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ assert(W == 4); \
+ if (xoffset == 0 && yoffset == 0) \
+ return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \
+ msk_stride, sse); \
+ else if (xoffset == 0) \
+ return aom_highbd_masked_subpel_var4xH_xzero( \
+ src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \
+ calc_var); \
+ else if (yoffset == 0) \
+ return aom_highbd_masked_subpel_var4xH_yzero( \
+ src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \
+ calc_var); \
+ else \
+ return aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
+ sse, H, calc_var); \
+ }
+
+#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \
+ unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
+ sse, calc_masked_variance, \
+ aom_highbd_masked_variance##W##x##H##_ssse3); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
+ sse, highbd_10_calc_masked_variance, \
+ aom_highbd_10_masked_variance##W##x##H##_ssse3); \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
+ sse, highbd_12_calc_masked_variance, \
+ aom_highbd_12_masked_variance##W##x##H##_ssse3); \
+ }
+
+HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4)
+HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128)
+#endif // CONFIG_EXT_PARTITION
+#endif
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 000000000..ad77f974c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ const int height) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_b = xx_loadl_32(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
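+
+// Scalar equivalent of the accumulation above (a sketch; xx_roundn_epu32 is
+// assumed to round to nearest before the 12-bit shift):
+//   for (int r = 0; r < height; ++r)
+//     for (int c = 0; c < 4; ++c)
+//       sad += ROUND_POWER_OF_TWO(
+//           abs(wsrc[r * 4 + c] - mask[r * 4 + c] * pre[r * pre_stride + c]),
+//           12);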
+
+static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, const int width,
+ const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
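+
+// Note: w and h are literal constants supplied by the macro expansion, so the
+// (w == 4) test above is a compile-time constant and the untaken branch can be
+// removed entirely by the compiler.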
+
+#if CONFIG_EXT_PARTITION
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+#if CONFIG_EXT_PARTITION
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 000000000..efb3659cf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_filter.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int h) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_b = xx_loadl_32(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
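+
+// Scalar sketch of the per-element update above (xx_roundn_epi32 is assumed to
+// do a signed round-to-nearest before the 12-bit shift; signed_round below is
+// just shorthand for that rounding, not a real helper):
+//   rdiff = signed_round(wsrc[n] - mask[n] * pre[row * pre_stride + col], 12);
+//   sum  += rdiff;
+//   sse  += rdiff * rdiff;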
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
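+
+// The wrappers above use the single-pass variance identity
+//   variance = sse - (sum * sum) / (W * H)
+// applied to the rounded, mask-weighted differences accumulated by the helpers.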
+
+#if CONFIG_EXT_PARTITION
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE void hbd_obmc_variance_w4(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+static INLINE void hbd_obmc_variance_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
+ const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum += xx_hsum_epi32_si64(v_sum_d);
+ *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ }
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ }
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 128) {
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128,
+ 32);
+ pre8 += 32 * pre_stride;
+ wsrc += 32 * 128;
+ mask += 32 * 128;
+ h -= 32;
+ } while (h > 0);
+ } else if (w == 64 && h >= 128) {
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64,
+ 64);
+ pre8 += 64 * pre_stride;
+ wsrc += 64 * 64;
+ mask += 64 * 64;
+ h -= 64;
+ } while (h > 0);
+ } else if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ }
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
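+
+// The 12-bit path above processes 128-wide and 64x128 blocks in strips,
+// flushing the 32-bit SIMD lane accumulators of hbd_obmc_variance_w8n into the
+// 64-bit sum/sse totals between strips, presumably so that the squared
+// differences of 12-bit data cannot overflow a 32-bit lane.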
+
+#define HBD_OBMCVARWXH(W, H) \
+ unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
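+
+// In the 10/12-bit wrappers, sse and sum are scaled down with
+// ROUND_POWER_OF_TWO before the variance identity is applied, so rounding can
+// leave the difference marginally negative; the result is therefore clamped to
+// zero rather than wrapping around as an unsigned value.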
+
+#if CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(64, 64)
+HBD_OBMCVARWXH(64, 32)
+HBD_OBMCVARWXH(32, 64)
+HBD_OBMCVARWXH(32, 32)
+HBD_OBMCVARWXH(32, 16)
+HBD_OBMCVARWXH(16, 32)
+HBD_OBMCVARWXH(16, 16)
+HBD_OBMCVARWXH(16, 8)
+HBD_OBMCVARWXH(8, 16)
+HBD_OBMCVARWXH(8, 8)
+HBD_OBMCVARWXH(8, 4)
+HBD_OBMCVARWXH(4, 8)
+HBD_OBMCVARWXH(4, 4)
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
new file mode 100644
index 000000000..954a95b98
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -0,0 +1,547 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+
+ vzeroupper
+
+ ; If we can skip this block, then just zero the output
+ cmp skipmp, 0
+ jne .blank
+
+%ifnidn %1, b_32x32
+
+  ; Special case for ncoeff == 16, as it is frequent and lets us avoid
+  ; setting up a loop.
+ cmp ncoeffmp, 16
+ jne .generic
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Special case of ncoeff == 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.single:
+
+ movifnidn coeffq, coeffmp
+ movifnidn zbinq, zbinmp
+ mova m0, [zbinq] ; m0 = zbin
+
+ ; Get DC and first 15 AC coeffs - in this special case, that is all.
+%if CONFIG_HIGHBITDEPTH
+ ; coeff stored as 32bit numbers but we process them as 16 bit numbers
+ mova m9, [coeffq]
+ packssdw m9, [coeffq+16] ; m9 = c[i]
+ mova m10, [coeffq+32]
+ packssdw m10, [coeffq+48] ; m10 = c[i]
+%else
+ mova m9, [coeffq] ; m9 = c[i]
+ mova m10, [coeffq+16] ; m10 = c[i]
+%endif
+
+ mov r0, eobmp ; Output pointer
+ mov r1, qcoeffmp ; Output pointer
+ mov r2, dqcoeffmp ; Output pointer
+
+ pxor m5, m5 ; m5 = dedicated zero
+
+ pcmpeqw m4, m4 ; All word lanes -1
+ paddw m0, m4 ; m0 = zbin - 1
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+ ; Check if all coeffs are less than zbin. If yes, we just write zeros
+ ; to the outputs and we are done.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .single_nonzero
+
+%if CONFIG_HIGHBITDEPTH
+ mova [r1 ], ymm5
+ mova [r1+32], ymm5
+ mova [r2 ], ymm5
+ mova [r2+32], ymm5
+%else
+ mova [r1], ymm5
+ mova [r2], ymm5
+%endif
+ mov [r0], word 0
+
+ vzeroupper
+ RET
+
+.single_nonzero:
+
+ ; Actual quantization of size 16 block - setup pointers, rounders, etc.
+ movifnidn r4, roundmp
+ movifnidn r5, quantmp
+ mov r3, dequantmp
+ mov r6, shiftmp
+ mova m1, [r4] ; m1 = round
+ mova m2, [r5] ; m2 = quant
+ mova m3, [r3] ; m3 = dequant
+ mova m4, [r6] ; m4 = shift
+
+ mov r3, iscanmp
+
+ DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
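+  ; Per coefficient c, the sequence below computes (DC and AC use separate
+  ; round/quant/shift/zbin entries, switched via punpckhqdq):
+  ;   tmp    = ((|c| + round) * quant) >> 16
+  ;   qcoeff = sign(c) * (((tmp + |c| + round) * shift) >> 16)
+  ; with qcoeff forced to zero wherever |c| < zbin.
+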
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+%if CONFIG_HIGHBITDEPTH
+ ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [qcoeffq ], m11
+ mova [qcoeffq+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+32], m11
+ mova [qcoeffq+48], m6
+%else
+ mova [qcoeffq ], m8
+ mova [qcoeffq+16], m13
+%endif
+
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+
+%if CONFIG_HIGHBITDEPTH
+  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [dqcoeffq ], m11
+ mova [dqcoeffq+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+32], m11
+ mova [dqcoeffq+48], m6
+%else
+ mova [dqcoeffq ], m8
+ mova [dqcoeffq+16], m13
+%endif
+
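+  ; eob: keep (scan position + 1) for each nonzero quantized coefficient and
+  ; zero elsewhere, then take the maximum across lanes.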
+ mova m6, [iscanq] ; m6 = scan[i]
+ mova m11, [iscanq+16] ; m11 = scan[i]
+
+ pcmpeqw m8, m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m13, m5 ; m13 = c[i] == 0
+ psubw m6, m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m8, m6 ; m8 = max(eob)
+ pandn m13, m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m8, m13
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [eobq], ax
+
+ vzeroupper
+ RET
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Generic case of ncoeff != 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.generic:
+
+%endif ; %ifnidn %1, b_32x32
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+ qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+ ; Actual quantization loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+ mova m3, [r2] ; m3 = dequant
+ pcmpeqw m4, m4 ; All lanes -1
+%ifidn %1, b_32x32
+ psubw m0, m4
+ psubw m1, m4
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
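+  ; b_32x32 only: zbin and round were just halved (rounding up); further down
+  ; the quantizer shift is doubled and the dequantized value halved to match.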
+  paddw           m0, m4                 ; m0 = m0 - 1 (zbin - 1)
+
+ mov r2, shiftmp
+ mov r3, qcoeffmp
+ mova m4, [r2] ; m4 = shift
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+
+%if CONFIG_HIGHBITDEPTH
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+%if CONFIG_HIGHBITDEPTH
+ ; coeff stored as 32bit numbers & require 16bit numbers
+ mova m9, [coeffq+ncoeffq*4+ 0]
+ packssdw m9, [coeffq+ncoeffq*4+16]
+ mova m10, [coeffq+ncoeffq*4+32]
+ packssdw m10, [coeffq+ncoeffq*4+48]
+%else
+ mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+ ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .first_nonzero
+
+%if CONFIG_HIGHBITDEPTH
+ mova [qcoeffq+ncoeffq*4 ], ymm5
+ mova [qcoeffq+ncoeffq*4+32], ymm5
+ mova [dqcoeffq+ncoeffq*4 ], ymm5
+ mova [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+ mova [qcoeffq+ncoeffq*2], ymm5
+ mova [dqcoeffq+ncoeffq*2], ymm5
+%endif
+
+ add ncoeffq, mmsize
+
+ punpckhqdq m1, m1
+ punpckhqdq m2, m2
+ punpckhqdq m3, m3
+ punpckhqdq m4, m4
+ pxor m8, m8
+
+ jmp .ac_only_loop
+
+.first_nonzero:
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+%if CONFIG_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], m8
+ mova [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+
+%if CONFIG_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], m8
+ mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i]
+ mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+
+.ac_only_loop:
+
+%if CONFIG_HIGHBITDEPTH
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [coeffq+ncoeffq*4+ 0]
+ packssdw m9, [coeffq+ncoeffq*4+16]
+ mova m10, [coeffq+ncoeffq*4+32]
+ packssdw m10, [coeffq+ncoeffq*4+48]
+%else
+ mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip this iteration
+  ; and just write zeros to the output.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .rest_nonzero
+
+%if CONFIG_HIGHBITDEPTH
+ mova [qcoeffq+ncoeffq*4+ 0], ymm5
+ mova [qcoeffq+ncoeffq*4+32], ymm5
+ mova [dqcoeffq+ncoeffq*4+ 0], ymm5
+ mova [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], ymm5
+ mova [dqcoeffq+ncoeffq*2+ 0], ymm5
+%endif
+ add ncoeffq, mmsize
+ jnz .ac_only_loop
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [r2], ax
+ vzeroupper
+ RET
+
+.rest_nonzero:
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+
+%if CONFIG_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m14
+ punpckhwd m6, m14, m6
+ pmovsxwd m11, m14
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], m14
+ mova [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+
+%if CONFIG_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m14
+ punpckhwd m6, m14, m6
+ pmovsxwd m11, m14
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], m14
+ mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jnz .ac_only_loop
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [r2], ax
+ vzeroupper
+ RET
+
+ ; Skip-block, i.e. just write all zeroes
+.blank:
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+ qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+
+%if CONFIG_HIGHBITDEPTH
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
+
+ neg ncoeffq
+ pxor m7, m7
+
+.blank_loop:
+%if CONFIG_HIGHBITDEPTH
+ mova [dqcoeffq+ncoeffq*4+ 0], ymm7
+ mova [dqcoeffq+ncoeffq*4+32], ymm7
+ mova [qcoeffq+ncoeffq*4+ 0], ymm7
+ mova [qcoeffq+ncoeffq*4+32], ymm7
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], ymm7
+ mova [qcoeffq+ncoeffq*2+ 0], ymm7
+%endif
+ add ncoeffq, mmsize
+ jl .blank_loop
+
+ mov [eobq], word 0
+
+ vzeroupper
+ RET
+%endmacro
+
+INIT_XMM avx
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
+
+END
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
new file mode 100644
index 000000000..890c1f01e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+#if CONFIG_HIGHBITDEPTH
+ return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+ (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+ (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+#else
+ return _mm_load_si128((const __m128i *)coeff_ptr);
+#endif
+}
+
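+// When CONFIG_HIGHBITDEPTH is set, store_coefficients() below widens eight
+// packed 16-bit values back to 32-bit tran_low_t lanes: _mm_mulhi_epi16 with 1
+// yields each value's sign-extension word and _mm_mullo_epi16 its low word, so
+// the unpacks interleave them into sign-extended 32-bit integers.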
+static INLINE void store_coefficients(__m128i coeff_vals,
+ tran_low_t *coeff_ptr) {
+#if CONFIG_HIGHBITDEPTH
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+#else
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
+#endif
+}
+
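+// A scalar sketch of what the vector code below computes for each coefficient
+// c (zbin, round, quant, quant_shift and dequant all carry separate DC and AC
+// entries; the "poor man's sign extract" handles abs/sign restoration):
+//   if (abs(c) > zbin - 1) {
+//     tmp     = ((abs(c) + round) * quant) >> 16;
+//     qcoeff  = sign(c) * (((tmp + abs(c) + round) * quant_shift) >> 16);
+//     dqcoeff = qcoeff * dequant;
+//   } else {
+//     qcoeff = dqcoeff = 0;
+//   }
+// and eob_ptr receives 1 + the scan position of the last nonzero coefficient.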
+void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ __m128i zero;
+ (void)scan_ptr;
+
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+ if (!skip_block) {
+ __m128i eob;
+ __m128i zbin;
+ __m128i round, quant, dequant, shift;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ __m128i pw_1;
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ pw_1 = _mm_set1_epi16(1);
+ zbin = _mm_sub_epi16(zbin, pw_1);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ __m128i cmp_mask0, cmp_mask1;
+ // Do DC and first 15 AC
+ coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+ coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+ qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+ qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+ store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ __m128i cmp_mask0, cmp_mask1;
+
+ coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+ coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+ qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+ qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+ qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+ store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ store_coefficients(zero, dqcoeff_ptr + n_coeffs);
+ store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
+ store_coefficients(zero, qcoeff_ptr + n_coeffs);
+ store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
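
For reference, the SSE2 loops above apply the standard AV1 quantize_b arithmetic eight coefficients per iteration. The following scalar sketch (a hypothetical helper, not part of the library) shows the same per-coefficient computation, assuming the usual DC/AC split of the quantizer tables and that the zbin test reduces to abs(coeff) >= zbin:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar reference for the vector loops above. Index 0 of each
 * table is the DC entry, index 1 the AC entry. */
static void quantize_b_scalar_sketch(
    const int16_t *coeff, intptr_t n_coeffs, const int16_t *zbin,
    const int16_t *round, const int16_t *quant, const int16_t *quant_shift,
    int16_t *qcoeff, int16_t *dqcoeff, const int16_t *dequant,
    const int16_t *iscan, uint16_t *eob_ptr) {
  int eob = -1;
  for (intptr_t i = 0; i < n_coeffs; ++i) {
    const int k = (i == 0) ? 0 : 1;     /* DC vs. AC table entry */
    const int abs_c = abs(coeff[i]);
    qcoeff[i] = 0;
    dqcoeff[i] = 0;
    if (abs_c >= zbin[k]) {             /* zbin mask (cmp_mask above) */
      int tmp = abs_c + round[k];
      if (tmp > INT16_MAX) tmp = INT16_MAX;  /* saturate, as adds_epi16 does */
      /* mulhi + add + mulhi, i.e. ((tmp*q >> 16) + tmp) * qshift >> 16 */
      tmp = ((((tmp * quant[k]) >> 16) + tmp) * quant_shift[k]) >> 16;
      qcoeff[i] = (int16_t)(coeff[i] < 0 ? -tmp : tmp);
      /* low 16 bits of the product, matching mullo_epi16 */
      dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[k]);
      if (tmp && iscan[i] > eob) eob = iscan[i];
    }
  }
  *eob_ptr = (uint16_t)(eob + 1);       /* eob is a count, not an index */
}

The "Poor man's sign extract" in the vector code (xor/sub with the arithmetic-shift sign mask) is simply abs(); applying the same xor/sub pair again after quantization restores the sign.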
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 000000000..36b4dddbd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,349 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; TODO(yunqingwang): fix quantize_b code for the skip=1 case.
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [r2q] ; m3 = dequant
+ psubw m0, [pw_1]
+ mov r2, shiftmp
+ mov r3, qcoeffmp
+ mova m4, [r2] ; m4 = shift
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+%if CONFIG_HIGHBITDEPTH
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+%if CONFIG_HIGHBITDEPTH
+  ; coeff is stored as 32bit numbers but we require 16bit numbers
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+%else
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+%if CONFIG_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], m8
+ mova [qcoeffq+ncoeffq*2+16], m13
+%endif
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+%if CONFIG_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], m8
+ mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+%if CONFIG_HIGHBITDEPTH
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+%else
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+ or r6, r2
+ jz .skip_iter
+%endif
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+%if CONFIG_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pxor m11, m11
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], m14
+ mova [qcoeffq+ncoeffq*2+16], m13
+%endif
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+%if CONFIG_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], m14
+ mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+%if CONFIG_HIGHBITDEPTH
+ mova [qcoeffq+ncoeffq*4+ 0], m5
+ mova [qcoeffq+ncoeffq*4+16], m5
+ mova [qcoeffq+ncoeffq*4+32], m5
+ mova [qcoeffq+ncoeffq*4+48], m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m5
+ mova [dqcoeffq+ncoeffq*4+16], m5
+ mova [dqcoeffq+ncoeffq*4+32], m5
+ mova [dqcoeffq+ncoeffq*4+48], m5
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], m5
+ mova [qcoeffq+ncoeffq*2+16], m5
+ mova [dqcoeffq+ncoeffq*2+ 0], m5
+ mova [dqcoeffq+ncoeffq*2+16], m5
+%endif
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+%if CONFIG_HIGHBITDEPTH
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+%if CONFIG_HIGHBITDEPTH
+ mova [dqcoeffq+ncoeffq*4+ 0], m7
+ mova [dqcoeffq+ncoeffq*4+16], m7
+ mova [dqcoeffq+ncoeffq*4+32], m7
+ mova [dqcoeffq+ncoeffq*4+48], m7
+ mova [qcoeffq+ncoeffq*4+ 0], m7
+ mova [qcoeffq+ncoeffq*4+16], m7
+ mova [qcoeffq+ncoeffq*4+32], m7
+ mova [qcoeffq+ncoeffq*4+48], m7
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], m7
+ mova [dqcoeffq+ncoeffq*2+16], m7
+ mova [qcoeffq+ncoeffq*2+ 0], m7
+ mova [qcoeffq+ncoeffq*2+16], m7
+%endif
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [eobq], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
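
The b_32x32 variant generated by the macro above reuses the same loop but rescales the quantizer constants first: zbin and round are halved with rounding up, the quant shift is doubled, and each dequantized magnitude is halved (psrlw by 1) before psignw restores its sign. An illustrative scalar view of that rescaling (the helper name is not part of the library):

#include <stdint.h>

/* Illustrative only: the constant rescaling the b_32x32 path performs before
 * entering the quantize loop (paddw 1 + psrlw 1 on zbin/round, psllw 1 on the
 * shift table). Index 0 is the DC entry, index 1 the AC entry. */
static void rescale_for_32x32_sketch(int16_t zbin[2], int16_t round[2],
                                     int16_t quant_shift[2]) {
  for (int k = 0; k < 2; ++k) {
    zbin[k] = (int16_t)((zbin[k] + 1) >> 1);
    round[k] = (int16_t)((round[k] + 1) >> 1);
    quant_shift[k] = (int16_t)(quant_shift[k] << 1);
  }
}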
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
new file mode 100644
index 000000000..e60f518b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ __m256i sum_mlow, sum_mhigh;
+ int i;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+ sum_ref0 = _mm256_set1_epi16(0);
+ sum_ref1 = _mm256_set1_epi16(0);
+ sum_ref2 = _mm256_set1_epi16(0);
+ sum_ref3 = _mm256_set1_epi16(0);
+ for (i = 0; i < 32; i++) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)src);
+ ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
+ ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
+ ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
+ ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+ {
+ __m128i sum;
+    // each 64-bit element of sum_ref-i holds its SAD in the low 4 bytes;
+    // the high 4 bytes are zero.
+    // shift sum_ref1 and sum_ref3 left by 4 bytes so they can be merged
+ sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+ sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // merge sum_ref0 with sum_ref1, and sum_ref2 with sum_ref3
+ sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+ sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+ // merge every 64 bit from each sum_ref-i
+ sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+ sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+ // add the low 64 bit to the high 64 bit
+ sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+ // add the low 128 bit to the high 128 bit
+ sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+ _mm256_extractf128_si256(sum_mlow, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+ }
+ _mm256_zeroupper();
+}
+
+void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
+ __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
+ __m256i ref3_reg, ref3next_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ __m256i sum_mlow, sum_mhigh;
+ int i;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+ sum_ref0 = _mm256_set1_epi16(0);
+ sum_ref1 = _mm256_set1_epi16(0);
+ sum_ref2 = _mm256_set1_epi16(0);
+ sum_ref3 = _mm256_set1_epi16(0);
+ for (i = 0; i < 64; i++) {
+ // load 64 bytes from src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)src);
+ srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
+ ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
+ ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
+ ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
+ ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
+ ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+ ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
+ ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
+ ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
+ ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+ {
+ __m128i sum;
+
+    // each 64-bit element of sum_ref-i holds its SAD in the low 4 bytes;
+    // the high 4 bytes are zero.
+    // shift sum_ref1 and sum_ref3 left by 4 bytes so they can be merged
+ sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+ sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // merge sum_ref0 with sum_ref1, and sum_ref2 with sum_ref3
+ sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+ sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+ // merge every 64 bit from each sum_ref-i
+ sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+ sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+ // add the low 64 bit to the high 64 bit
+ sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+ // add the low 128 bit to the high 128 bit
+ sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+ _mm256_extractf128_si256(sum_mlow, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+ }
+ _mm256_zeroupper();
+}
+
+void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ const uint8_t *rf[4];
+ uint32_t sum0[4];
+ uint32_t sum1[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
+ src += src_stride << 5;
+ rf[0] += ref_stride << 5;
+ rf[1] += ref_stride << 5;
+ rf[2] += ref_stride << 5;
+ rf[3] += ref_stride << 5;
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+ res[2] = sum0[2] + sum1[2];
+ res[3] = sum0[3] + sum1[3];
+}
+
+void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ const uint8_t *rf[4];
+ uint32_t sum0[4];
+ uint32_t sum1[4];
+ unsigned int half_width = 32;
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
+ src += half_width;
+ rf[0] += half_width;
+ rf[1] += half_width;
+ rf[2] += half_width;
+ rf[3] += half_width;
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+ res[2] = sum0[2] + sum1[2];
+ res[3] = sum0[3] + sum1[3];
+}
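
Each of the x4d kernels above computes, per candidate reference, an ordinary block SAD; the AVX2 code merely evaluates all four references in one pass over the source block. A scalar sketch of the quantity being produced (hypothetical helper, not in the library):

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar reference: res[k] is the sum of absolute differences
 * between the w-by-h source block and the block at ref[k]. */
static void sad_wxh_x4d_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *const ref[4], int ref_stride,
                               int w, int h, uint32_t res[4]) {
  for (int k = 0; k < 4; ++k) {
    uint32_t sum = 0;
    for (int y = 0; y < h; ++y)
      for (int x = 0; x < w; ++x)
        sum += abs(src[y * src_stride + x] - ref[k][y * ref_stride + x]);
    res[k] = sum;
  }
}

The 32x64 and 64x32 wrappers above then compose two 32x32 calls, stepping down 32 rows (stride << 5) in one case and right 32 columns in the other.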
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 000000000..8f04ef2f3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,253 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_4x2x4 5-6 0
+ movd m0, [srcq +%2]
+%if %1 == 1
+ movd m6, [ref1q+%3]
+ movd m4, [ref2q+%3]
+ movd m7, [ref3q+%3]
+ movd m5, [ref4q+%3]
+ movd m1, [srcq +%4]
+ movd m2, [ref1q+%5]
+ punpckldq m0, m1
+ punpckldq m6, m2
+ movd m1, [ref2q+%5]
+ movd m2, [ref3q+%5]
+ movd m3, [ref4q+%5]
+ punpckldq m4, m1
+ punpckldq m7, m2
+ punpckldq m5, m3
+ movlhps m0, m0
+ movlhps m6, m4
+ movlhps m7, m5
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movd m1, [ref1q+%3]
+ movd m5, [ref1q+%5]
+ movd m2, [ref2q+%3]
+ movd m4, [ref2q+%5]
+ punpckldq m1, m5
+ punpckldq m2, m4
+ movd m3, [ref3q+%3]
+ movd m5, [ref3q+%5]
+ punpckldq m3, m5
+ movd m4, [ref4q+%3]
+ movd m5, [ref4q+%5]
+ punpckldq m4, m5
+ movd m5, [srcq +%4]
+ punpckldq m0, m5
+ movlhps m0, m0
+ movlhps m1, m2
+ movlhps m3, m4
+ psadbw m1, m0
+ psadbw m3, m0
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_8x2x4 5-6 0
+ movh m0, [srcq +%2]
+%if %1 == 1
+ movh m4, [ref1q+%3]
+ movh m5, [ref2q+%3]
+ movh m6, [ref3q+%3]
+ movh m7, [ref4q+%3]
+ movhps m0, [srcq +%4]
+ movhps m4, [ref1q+%5]
+ movhps m5, [ref2q+%5]
+ movhps m6, [ref3q+%5]
+ movhps m7, [ref4q+%5]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movh m1, [ref1q+%3]
+ movh m2, [ref2q+%3]
+ movh m3, [ref3q+%3]
+ movhps m0, [srcq +%4]
+ movhps m1, [ref1q+%5]
+ movhps m2, [ref2q+%5]
+ movhps m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movh m1, [ref4q+%3]
+ movhps m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_16x2x4 5-6 0
+ ; 1st 16 px
+ mova m0, [srcq +%2]
+%if %1 == 1
+ movu m4, [ref1q+%3]
+ movu m5, [ref2q+%3]
+ movu m6, [ref3q+%3]
+ movu m7, [ref4q+%3]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movu m1, [ref1q+%3]
+ movu m2, [ref2q+%3]
+ movu m3, [ref3q+%3]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%3]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+
+ ; 2nd 16 px
+ mova m0, [srcq +%4]
+ movu m1, [ref1q+%5]
+ movu m2, [ref2q+%5]
+ movu m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+ psadbw m1, m0
+ paddd m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_32x2x4 5-6 0
+ PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
+ PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_64x2x4 5-6 0
+ PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
+ PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
+%endmacro
+
+; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_128x2x4 5-6 0
+ PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
+ PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6
+%endmacro
+
+; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4, and (with CONFIG_EXT_PARTITION) 128x128, 128x64, 64x128
+%macro SADNXN4D 2
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+ PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+
+%if %1 > 4
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ movifnidn r4, r4mp
+ paddd m4, m5
+ movu [r4], m4
+ RET
+%else
+ movifnidn r4, r4mp
+ pshufd m6, m6, 0x08
+ pshufd m7, m7, 0x08
+ movq [r4+0], m6
+ movq [r4+8], m7
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+%if CONFIG_EXT_PARTITION
+SADNXN4D 128, 128
+SADNXN4D 128, 64
+SADNXN4D 64, 128
+%endif
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+SADNXN4D 8, 4
+SADNXN4D 4, 8
+SADNXN4D 4, 4
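
After the row loop, SADNXN4D folds the four psadbw accumulators (m4..m7, one per reference) into res[4]. For block widths above 4, each accumulator holds two 64-bit partial sums (from the low and high halves of each row), and the pslldq/por/punpcklqdq/punpckhqdq/paddd tail simply adds the two halves of every accumulator and stores the four totals side by side. A minimal sketch of that reduction, assuming the partial sums have already been read out of the registers:

#include <stdint.h>

/* Hypothetical reduction step: acc[k][0] and acc[k][1] stand for the two
 * 64-bit partial sums psadbw leaves in accumulator k. */
static void reduce_sad4d_sketch(const uint64_t acc[4][2], uint32_t res[4]) {
  for (int k = 0; k < 4; ++k) res[k] = (uint32_t)(acc[k][0] + acc[k][1]);
}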
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
new file mode 100644
index 000000000..efba61289
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
+#define FSAD64_H(h) \
+ unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSAD32_H(h) \
+ unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSAD64 \
+ FSAD64_H(64); \
+ FSAD64_H(32);
+
+#define FSAD32 \
+ FSAD32_H(64); \
+ FSAD32_H(32); \
+ FSAD32_H(16);
+
+/* clang-format off */
+FSAD64
+FSAD32
+/* clang-format on */
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h) \
+ unsigned int aom_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG32_H(h) \
+ unsigned int aom_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG64 \
+ FSADAVG64_H(64); \
+ FSADAVG64_H(32);
+
+#define FSADAVG32 \
+ FSADAVG32_H(64); \
+ FSADAVG32_H(32); \
+ FSADAVG32_H(16);
+
+/* clang-format off */
+FSADAVG64
+FSADAVG32
+/* clang-format on */
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
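
The FSADAVG macros above differ from the plain FSAD ones only in averaging the reference against second_pred (with rounding, which is what _mm256_avg_epu8 computes) before taking the SAD. A scalar sketch of one _avg kernel, assuming second_pred is a contiguous w-by-h block as the "second_pred += 64" advance implies (the helper name is illustrative):

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar counterpart of the _avg_avx2 kernels above. */
static unsigned int sad_avg_sketch(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   const uint8_t *second_pred, int w, int h) {
  unsigned int sum = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int avg = (ref[x] + second_pred[x] + 1) >> 1; /* rounded average */
      sum += abs(src[x] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += w; /* contiguous prediction block */
  }
  return sum;
}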
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
new file mode 100644
index 000000000..196394379
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+// SAD
+static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
+ // input 8 32-bit summation
+ __m128i lo128, hi128;
+ __m256i u = _mm256_srli_si256(*v, 8);
+ u = _mm256_add_epi32(u, *v);
+
+ // 4 32-bit summation
+ hi128 = _mm256_extracti128_si256(u, 1);
+ lo128 = _mm256_castsi256_si128(u);
+ lo128 = _mm_add_epi32(hi128, lo128);
+
+ // 2 32-bit summation
+ hi128 = _mm_srli_si128(lo128, 4);
+ lo128 = _mm_add_epi32(lo128, hi128);
+
+ return (unsigned int)_mm_cvtsi128_si32(lo128);
+}
+
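
get_sad_from_mm256_epi32() above is a horizontal add of the eight 32-bit lanes of the accumulator; a trivial scalar equivalent (illustrative only):

#include <stdint.h>

/* Illustrative scalar view of the reduction: add all eight 32-bit lanes. */
static unsigned int hsum_epi32_sketch(const uint32_t lanes[8]) {
  unsigned int sum = 0;
  for (int i = 0; i < 8; ++i) sum += lanes[i];
  return sum;
}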
+unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+ // first 4 rows
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ __m256i u0 = _mm256_sub_epi16(s0, r0);
+ __m256i u1 = _mm256_sub_epi16(s1, r1);
+ __m256i u2 = _mm256_sub_epi16(s2, r2);
+ __m256i u3 = _mm256_sub_epi16(s3, r3);
+ __m256i zero = _mm256_setzero_si256();
+ __m256i sum0, sum1;
+
+ u0 = _mm256_abs_epi16(u0);
+ u1 = _mm256_abs_epi16(u1);
+ u2 = _mm256_abs_epi16(u2);
+ u3 = _mm256_abs_epi16(u3);
+
+ sum0 = _mm256_add_epi16(u0, u1);
+ sum0 = _mm256_add_epi16(sum0, u2);
+ sum0 = _mm256_add_epi16(sum0, u3);
+
+ // second 4 rows
+ src_ptr += src_stride << 2;
+ ref_ptr += ref_stride << 2;
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ u0 = _mm256_sub_epi16(s0, r0);
+ u1 = _mm256_sub_epi16(s1, r1);
+ u2 = _mm256_sub_epi16(s2, r2);
+ u3 = _mm256_sub_epi16(s3, r3);
+
+ u0 = _mm256_abs_epi16(u0);
+ u1 = _mm256_abs_epi16(u1);
+ u2 = _mm256_abs_epi16(u2);
+ u3 = _mm256_abs_epi16(u3);
+
+ sum1 = _mm256_add_epi16(u0, u1);
+ sum1 = _mm256_add_epi16(sum1, u2);
+ sum1 = _mm256_add_epi16(sum1, u3);
+
+ // find out the SAD
+ s0 = _mm256_unpacklo_epi16(sum0, zero);
+ s1 = _mm256_unpackhi_epi16(sum0, zero);
+ r0 = _mm256_unpacklo_epi16(sum1, zero);
+ r1 = _mm256_unpackhi_epi16(sum1, zero);
+ s0 = _mm256_add_epi32(s0, s1);
+ r0 = _mm256_add_epi32(r0, r1);
+ sum0 = _mm256_add_epi32(s0, r0);
+ // 8 32-bit summation
+
+ return (unsigned int)get_sad_from_mm256_epi32(&sum0);
+}
+
+unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3;
+ __m256i sum0;
+ __m256i sum = _mm256_setzero_si256();
+ const __m256i zero = _mm256_setzero_si256();
+ int row = 0;
+
+ // Loop for every 4 rows
+ while (row < 16) {
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ u0 = _mm256_sub_epi16(s0, r0);
+ u1 = _mm256_sub_epi16(s1, r1);
+ u2 = _mm256_sub_epi16(s2, r2);
+ u3 = _mm256_sub_epi16(s3, r3);
+
+ u0 = _mm256_abs_epi16(u0);
+ u1 = _mm256_abs_epi16(u1);
+ u2 = _mm256_abs_epi16(u2);
+ u3 = _mm256_abs_epi16(u3);
+
+ sum0 = _mm256_add_epi16(u0, u1);
+ sum0 = _mm256_add_epi16(sum0, u2);
+ sum0 = _mm256_add_epi16(sum0, u3);
+
+ s0 = _mm256_unpacklo_epi16(sum0, zero);
+ s1 = _mm256_unpackhi_epi16(sum0, zero);
+ sum = _mm256_add_epi32(sum, s0);
+ sum = _mm256_add_epi32(sum, s1);
+ // 8 32-bit summation
+
+ row += 4;
+ src_ptr += src_stride << 2;
+ ref_ptr += ref_stride << 2;
+ }
+ return get_sad_from_mm256_epi32(&sum);
+}
+
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s0, s1, s2, s3, r0, r1, r2, r3;
+ const __m256i zero = _mm256_setzero_si256();
+ int row_sections = 0;
+
+ while (row_sections < 2) {
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
+
+ if (sec_ptr) {
+ r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r1 = _mm256_avg_epu16(
+ r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r2 = _mm256_avg_epu16(
+ r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r3 = _mm256_avg_epu16(
+ r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ }
+ s0 = _mm256_sub_epi16(s0, r0);
+ s1 = _mm256_sub_epi16(s1, r1);
+ s2 = _mm256_sub_epi16(s2, r2);
+ s3 = _mm256_sub_epi16(s3, r3);
+
+ s0 = _mm256_abs_epi16(s0);
+ s1 = _mm256_abs_epi16(s1);
+ s2 = _mm256_abs_epi16(s2);
+ s3 = _mm256_abs_epi16(s3);
+
+ s0 = _mm256_add_epi16(s0, s1);
+ s0 = _mm256_add_epi16(s0, s2);
+ s0 = _mm256_add_epi16(s0, s3);
+
+ r0 = _mm256_unpacklo_epi16(s0, zero);
+ r1 = _mm256_unpackhi_epi16(s0, zero);
+
+ r0 = _mm256_add_epi32(r0, r1);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r0);
+
+ row_sections += 1;
+ src_ptr += src_stride << 1;
+ ref_ptr += ref_stride << 1;
+ if (sec_ptr) sec_ptr += 32 << 1;
+ }
+}
+
+unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 5;
+ ref += ref_stride << 5;
+ sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+static void sad64x2(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[8], r[8];
+ const __m256i zero = _mm256_setzero_si256();
+
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+ s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+ s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32));
+ s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
+ r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32));
+ r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48));
+
+ if (sec_ptr) {
+ r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ r[4] = _mm256_avg_epu16(
+ r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
+ r[5] = _mm256_avg_epu16(
+ r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
+ r[6] = _mm256_avg_epu16(
+ r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
+ r[7] = _mm256_avg_epu16(
+ r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
+ }
+
+ s[0] = _mm256_sub_epi16(s[0], r[0]);
+ s[1] = _mm256_sub_epi16(s[1], r[1]);
+ s[2] = _mm256_sub_epi16(s[2], r[2]);
+ s[3] = _mm256_sub_epi16(s[3], r[3]);
+ s[4] = _mm256_sub_epi16(s[4], r[4]);
+ s[5] = _mm256_sub_epi16(s[5], r[5]);
+ s[6] = _mm256_sub_epi16(s[6], r[6]);
+ s[7] = _mm256_sub_epi16(s[7], r[7]);
+
+ s[0] = _mm256_abs_epi16(s[0]);
+ s[1] = _mm256_abs_epi16(s[1]);
+ s[2] = _mm256_abs_epi16(s[2]);
+ s[3] = _mm256_abs_epi16(s[3]);
+ s[4] = _mm256_abs_epi16(s[4]);
+ s[5] = _mm256_abs_epi16(s[5]);
+ s[6] = _mm256_abs_epi16(s[6]);
+ s[7] = _mm256_abs_epi16(s[7]);
+
+ s[0] = _mm256_add_epi16(s[0], s[1]);
+ s[0] = _mm256_add_epi16(s[0], s[2]);
+ s[0] = _mm256_add_epi16(s[0], s[3]);
+
+ s[4] = _mm256_add_epi16(s[4], s[5]);
+ s[4] = _mm256_add_epi16(s[4], s[6]);
+ s[4] = _mm256_add_epi16(s[4], s[7]);
+
+ r[0] = _mm256_unpacklo_epi16(s[0], zero);
+ r[1] = _mm256_unpackhi_epi16(s[0], zero);
+ r[2] = _mm256_unpacklo_epi16(s[4], zero);
+ r[3] = _mm256_unpackhi_epi16(s[4], zero);
+
+ r[0] = _mm256_add_epi32(r[0], r[1]);
+ r[0] = _mm256_add_epi32(r[0], r[2]);
+ r[0] = _mm256_add_epi32(r[0], r[3]);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 16) {
+ sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 5;
+ ref += ref_stride << 5;
+ sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+#if CONFIG_EXT_PARTITION
+static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[8], r[8];
+ const __m256i zero = _mm256_setzero_si256();
+
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+ s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64));
+ s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80));
+ s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96));
+ s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64));
+ r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80));
+ r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96));
+ r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112));
+
+ if (sec_ptr) {
+ r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ r[4] = _mm256_avg_epu16(
+ r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
+ r[5] = _mm256_avg_epu16(
+ r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
+ r[6] = _mm256_avg_epu16(
+ r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
+ r[7] = _mm256_avg_epu16(
+ r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
+ }
+
+ s[0] = _mm256_sub_epi16(s[0], r[0]);
+ s[1] = _mm256_sub_epi16(s[1], r[1]);
+ s[2] = _mm256_sub_epi16(s[2], r[2]);
+ s[3] = _mm256_sub_epi16(s[3], r[3]);
+ s[4] = _mm256_sub_epi16(s[4], r[4]);
+ s[5] = _mm256_sub_epi16(s[5], r[5]);
+ s[6] = _mm256_sub_epi16(s[6], r[6]);
+ s[7] = _mm256_sub_epi16(s[7], r[7]);
+
+ s[0] = _mm256_abs_epi16(s[0]);
+ s[1] = _mm256_abs_epi16(s[1]);
+ s[2] = _mm256_abs_epi16(s[2]);
+ s[3] = _mm256_abs_epi16(s[3]);
+ s[4] = _mm256_abs_epi16(s[4]);
+ s[5] = _mm256_abs_epi16(s[5]);
+ s[6] = _mm256_abs_epi16(s[6]);
+ s[7] = _mm256_abs_epi16(s[7]);
+
+ s[0] = _mm256_add_epi16(s[0], s[1]);
+ s[0] = _mm256_add_epi16(s[0], s[2]);
+ s[0] = _mm256_add_epi16(s[0], s[3]);
+
+ s[4] = _mm256_add_epi16(s[4], s[5]);
+ s[4] = _mm256_add_epi16(s[4], s[6]);
+ s[4] = _mm256_add_epi16(s[4], s[7]);
+
+ r[0] = _mm256_unpacklo_epi16(s[0], zero);
+ r[1] = _mm256_unpackhi_epi16(s[0], zero);
+ r[2] = _mm256_unpacklo_epi16(s[4], zero);
+ r[3] = _mm256_unpackhi_epi16(s[4], zero);
+
+ r[0] = _mm256_add_epi32(r[0], r[1]);
+ r[0] = _mm256_add_epi32(r[0], r[2]);
+ r[0] = _mm256_add_epi32(r[0], r[3]);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ int row = 0;
+ while (row < 64) {
+ sad128x1(srcp, refp, NULL, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ row += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 6;
+ ref += ref_stride << 6;
+ sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 6;
+ ref += ref_stride << 6;
+ sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+#endif // CONFIG_EXT_PARTITION
+
+// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s0, s1, s2, s3, r0, r1, r2, r3;
+ const __m256i zero = _mm256_setzero_si256();
+
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ if (sec_ptr) {
+ r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r1 = _mm256_avg_epu16(r1,
+ _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r2 = _mm256_avg_epu16(r2,
+ _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r3 = _mm256_avg_epu16(r3,
+ _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ }
+
+ s0 = _mm256_sub_epi16(s0, r0);
+ s1 = _mm256_sub_epi16(s1, r1);
+ s2 = _mm256_sub_epi16(s2, r2);
+ s3 = _mm256_sub_epi16(s3, r3);
+
+ s0 = _mm256_abs_epi16(s0);
+ s1 = _mm256_abs_epi16(s1);
+ s2 = _mm256_abs_epi16(s2);
+ s3 = _mm256_abs_epi16(s3);
+
+ s0 = _mm256_add_epi16(s0, s1);
+ s0 = _mm256_add_epi16(s0, s2);
+ s0 = _mm256_add_epi16(s0, s3);
+
+ r0 = _mm256_unpacklo_epi16(s0, zero);
+ r1 = _mm256_unpackhi_epi16(s0, zero);
+
+ r0 = _mm256_add_epi32(r0, r1);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r0);
+}
+
+unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+
+ // Next 4 rows
+ srcp += src_stride << 2;
+ refp += ref_stride << 2;
+ secp += 64;
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 3;
+ uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 32 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 16) {
+ sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 64 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+#if CONFIG_EXT_PARTITION
+unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 6;
+ uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ int row = 0;
+ while (row < 64) {
+ sad128x1(srcp, refp, secp, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ secp += 16 << 3;
+ row += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int sum;
+ const int left_shift = 6;
+
+ sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 128 << left_shift;
+ sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+#endif // CONFIG_EXT_PARTITION
+
+// SAD 4D
+// Combine 4 __m256i vectors to uint32_t result[4]
+static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
+ uint32_t *res) {
+ __m256i u0, u1, u2, u3;
+ const __m256i mask = _mm256_set1_epi64x(UINT32_MAX);
+ __m128i sad;
+
+ // 8 32-bit summation
+ u0 = _mm256_srli_si256(v[0], 4);
+ u1 = _mm256_srli_si256(v[1], 4);
+ u2 = _mm256_srli_si256(v[2], 4);
+ u3 = _mm256_srli_si256(v[3], 4);
+
+ u0 = _mm256_add_epi32(u0, v[0]);
+ u1 = _mm256_add_epi32(u1, v[1]);
+ u2 = _mm256_add_epi32(u2, v[2]);
+ u3 = _mm256_add_epi32(u3, v[3]);
+
+ u0 = _mm256_and_si256(u0, mask);
+ u1 = _mm256_and_si256(u1, mask);
+ u2 = _mm256_and_si256(u2, mask);
+ u3 = _mm256_and_si256(u3, mask);
+ // 4 32-bit summation, evenly positioned
+
+ u1 = _mm256_slli_si256(u1, 4);
+ u3 = _mm256_slli_si256(u3, 4);
+
+ u0 = _mm256_or_si256(u0, u1);
+ u2 = _mm256_or_si256(u2, u3);
+ // 8 32-bit summation, interleaved
+
+ u1 = _mm256_unpacklo_epi64(u0, u2);
+ u3 = _mm256_unpackhi_epi64(u0, u2);
+
+ u0 = _mm256_add_epi32(u1, u3);
+ sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
+ _mm256_castsi256_si128(u0));
+ _mm_storeu_si128((__m128i *)res, sad);
+}
+
+static void convert_pointers(const uint8_t *const ref8[],
+ const uint16_t *ref[]) {
+ ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
+ ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
+ ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
+ ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
+}
+
+static void init_sad(__m256i *s) {
+ s[0] = _mm256_setzero_si256();
+ s[1] = _mm256_setzero_si256();
+ s[2] = _mm256_setzero_si256();
+ s[3] = _mm256_setzero_si256();
+}
+
+void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ const int shift_for_4_rows = 2;
+ int i;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ srcp += src_stride << shift_for_4_rows;
+ refp[i] += ref_stride << shift_for_4_rows;
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first8rows[4];
+ uint32_t second8rows[4];
+ const uint8_t *ref[4];
+ const int shift_for_8_rows = 3;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows);
+ src += src_stride << shift_for_8_rows;
+ ref[0] += ref_stride << shift_for_8_rows;
+ ref[1] += ref_stride << shift_for_8_rows;
+ ref[2] += ref_stride << shift_for_8_rows;
+ ref[3] += ref_stride << shift_for_8_rows;
+ aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows);
+ sad_array[0] = first8rows[0] + second8rows[0];
+ sad_array[1] = first8rows[1] + second8rows[1];
+ sad_array[2] = first8rows[2] + second8rows[2];
+ sad_array[3] = first8rows[3] + second8rows[3];
+}
+
+void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 4;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ const int shift_for_4_rows = 2;
+ int i;
+ int rows_section;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ rows_section = 0;
+ while (rows_section < 4) {
+ sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ srcp += src_stride << shift_for_4_rows;
+ refp[i] += ref_stride << shift_for_4_rows;
+ rows_section++;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 4;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 5;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ const int shift_for_rows = 1;
+ int i;
+ int rows_section;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ rows_section = 0;
+ while (rows_section < 16) {
+ sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ srcp += src_stride << shift_for_rows;
+ refp[i] += ref_stride << shift_for_rows;
+ rows_section++;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 5;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+#if CONFIG_EXT_PARTITION
+void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 6;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ int i;
+ int rows_section;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ rows_section = 0;
+ while (rows_section < 64) {
+ sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
+ srcp += src_stride;
+ refp[i] += ref_stride;
+ rows_section++;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 6;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+#endif // CONFIG_EXT_PARTITION
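The wrappers above build larger block sizes out of smaller ones: a WxH high-bitdepth SAD is the sum of two Wx(H/2) SADs, with the pointers advanced by H/2 rows (written as `stride << shift` in the kernels). The following is a minimal scalar sketch of that pattern; the function names and the use of plain uint16_t pointers (rather than CONVERT_TO_SHORTPTR) are illustrative assumptions, not part of the library.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference SAD over a width x height high-bitdepth block.
 * Illustrative only; this helper is not part of the library. */
static uint32_t highbd_sad_ref(const uint16_t *src, int src_stride,
                               const uint16_t *ref, int ref_stride,
                               int width, int height) {
  uint32_t sad = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c)
      sad += (uint32_t)abs((int)src[c] - (int)ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* A WxH SAD as the sum of two Wx(H/2) halves, mirroring how, for example,
 * aom_highbd_sad64x64_avg_avx2 builds on aom_highbd_sad64x32_avg_avx2. */
static uint32_t highbd_sad_split(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride,
                                 int width, int height) {
  const int half = height / 2;
  uint32_t sum = highbd_sad_ref(src, src_stride, ref, ref_stride, width, half);
  sum += highbd_sad_ref(src + half * src_stride, src_stride,
                        ref + half * ref_stride, ref_stride, width, half);
  return sum;
}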
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 000000000..4419c65b2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "./aom_dsp_rtcd.h"
+
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ __m256i s1, s2, r1, r2;
+ __m256i sum = _mm256_setzero_si256();
+ __m128i sum_i128;
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+ s2 = _mm256_sad_epu8(
+ r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+ ref_ptr += ref_stride << 1;
+ src_ptr += src_stride << 1;
+ }
+
+ sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+ sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+ _mm256_castsi256_si128(sum));
+ return _mm_cvtsi128_si32(sum_i128);
+}
+
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 32;
+ uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 5;
+ ref_ptr += ref_stride << 5;
+ sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+static void sad64x64x4d(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ __m128i *res) {
+ uint32_t sum[4];
+ aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum);
+ *res = _mm_loadu_si128((const __m128i *)sum);
+}
+
+void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m128i sum0, sum1;
+ const uint8_t *rf[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
+ src += src_stride << 6;
+ rf[0] += ref_stride << 6;
+ rf[1] += ref_stride << 6;
+ rf[2] += ref_stride << 6;
+ rf[3] += ref_stride << 6;
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
+ sum0 = _mm_add_epi32(sum0, sum1);
+ _mm_storeu_si128((__m128i *)res, sum0);
+}
+
+void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m128i sum0, sum1;
+ unsigned int half_width = 64;
+ const uint8_t *rf[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
+ src += half_width;
+ rf[0] += half_width;
+ rf[1] += half_width;
+ rf[2] += half_width;
+ rf[3] += half_width;
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
+ sum0 = _mm_add_epi32(sum0, sum1);
+ _mm_storeu_si128((__m128i *)res, sum0);
+}
+
+void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ const uint8_t *rf[4];
+ uint32_t sum0[4];
+ uint32_t sum1[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0);
+ src += src_stride << 6;
+ rf[0] += ref_stride << 6;
+ rf[1] += ref_stride << 6;
+ rf[2] += ref_stride << 6;
+ rf[3] += ref_stride << 6;
+ aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1);
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+ res[2] = sum0[2] + sum1[2];
+ res[3] = sum0[3] + sum1[3];
+}
+
+static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int h, const uint8_t *second_pred,
+ const int second_pred_stride) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ ref1_reg = _mm256_avg_epu8(
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
+ ref2_reg = _mm256_avg_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ second_pred += second_pred_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+
+ return res;
+}
+
+unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 64 << 6;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ second_pred += half_width;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
+ ref_stride, second_pred);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 128 << 6;
+ sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,
+ second_pred);
+ return sum;
+}
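The *_avg entry points in this file compare the source block against the rounded average of the reference block and a second predictor, which is what _mm256_avg_epu8 (and pavgb in the asm versions) computes per byte: (a + b + 1) >> 1. A scalar sketch of that contract follows; the function name is illustrative, and it assumes the common case where the second predictor is stored contiguously with a stride equal to the block width, as in the 64-wide callers above.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the *_avg kernels: SAD between the source block and
 * the rounded average of the reference block and a second predictor. */
static uint32_t sad_avg_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            const uint8_t *second_pred,
                            int width, int height) {
  uint32_t sad = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      /* pavgb / _mm256_avg_epu8 semantics: (a + b + 1) >> 1 per byte. */
      const int avg = (ref[c] + second_pred[c] + 1) >> 1;
      sad += (uint32_t)abs((int)src[c] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;  /* second predictor assumed width-strided */
  }
  return sad;
}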
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
new file mode 100644
index 000000000..e45457a57
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -0,0 +1,345 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+%if CONFIG_EXT_PARTITION
+; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD128XN 1-2 0
+ SAD_FN 128, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*4]
+ pavgb m2, [second_predq+mmsize*5]
+ pavgb m3, [second_predq+mmsize*6]
+ pavgb m4, [second_predq+mmsize*7]
+ lea second_predq, [second_predq+mmsize*8]
+%endif
+ psadbw m1, [srcq+64]
+ psadbw m2, [srcq+80]
+ psadbw m3, [srcq+96]
+ psadbw m4, [srcq+112]
+
+ add refq, ref_strideq
+ add srcq, src_strideq
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ sub n_rowsd, 1
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD128XN 128 ; sad128x128_sse2
+SAD128XN 128, 1 ; sad128x128_avg_sse2
+SAD128XN 64 ; sad128x64_sse2
+SAD128XN 64, 1 ; sad128x64_avg_sse2
+%endif
+
+
+; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+%if CONFIG_EXT_PARTITION
+SAD64XN 128 ; sad64x128_sse2
+SAD64XN 128, 1 ; sad64x128_avg_sse2
+%endif
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+
+; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
+ mov n_rowsd, %1/2
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+
+; unsigned int aom_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+
+; unsigned int aom_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+
+; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+
+.loop:
+ movd m1, [refq]
+ movd m2, [refq+ref_strideq]
+ movd m3, [refq+ref_strideq*2]
+ movd m4, [refq+ref_stride3q]
+ punpckldq m1, m2
+ punpckldq m3, m4
+ movlhps m1, m3
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ lea second_predq, [second_predq+mmsize*1]
+%endif
+ movd m2, [srcq]
+ movd m5, [srcq+src_strideq]
+ movd m4, [srcq+src_strideq*2]
+ movd m3, [srcq+src_stride3q]
+ punpckldq m2, m5
+ punpckldq m4, m3
+ movlhps m2, m4
+ psadbw m1, m2
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN 8 ; sad4x8_sse2
+SAD4XN 4 ; sad4x4_sse2
+SAD4XN 8, 1 ; sad4x8_avg_sse2
+SAD4XN 4, 1 ; sad4x4_avg_sse2
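Every SADNxM macro in this file uses the same reduction: psadbw yields one 64-bit partial sum per 8 source bytes, the partials are accumulated per row, and movhlps/paddd/movd fold the two halves of the accumulator into the 32-bit return value. A sketch of the 16-wide case with SSE2 intrinsics, assuming a compiler providing <emmintrin.h>; the function name is illustrative, and unlike the asm (which unrolls four rows per iteration) it processes one row at a time.

#include <emmintrin.h>
#include <stdint.h>

/* 16-wide SAD with SSE2 intrinsics, mirroring the reduction in the
 * SAD16XN macro above. Illustrative only. */
static unsigned int sad16xh_sse2_sketch(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        int height) {
  __m128i sum = _mm_setzero_si128();  /* pxor m0, m0 */
  for (int i = 0; i < height; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
    /* psadbw: two 64-bit partial sums, one per 8-byte half of the row. */
    sum = _mm_add_epi32(sum, _mm_sad_epu8(s, r));
    src += src_stride;
    ref += ref_stride;
  }
  /* Fold the two halves (movhlps + paddd) and return the low dword (movd). */
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}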
diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm
new file mode 100644
index 000000000..f6c27c855
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_sse3.asm
@@ -0,0 +1,377 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %if LIBAOM_YASM_WIN64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %if LIBAOM_YASM_WIN64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+%macro PROCESS_16X2X3 5
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm5, XMMWORD PTR [%3]
+ lddqu xmm6, XMMWORD PTR [%3+1]
+ lddqu xmm7, XMMWORD PTR [%3+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%3+1]
+ lddqu xmm3, XMMWORD PTR [%3+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [%2+%4]
+ lddqu xmm1, XMMWORD PTR [%3+%5]
+ lddqu xmm2, XMMWORD PTR [%3+%5+1]
+ lddqu xmm3, XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 5
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm5, QWORD PTR [%3]
+ movq mm6, QWORD PTR [%3+1]
+ movq mm7, QWORD PTR [%3+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%3+1]
+ movq mm3, QWORD PTR [%3+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ movq mm0, QWORD PTR [%2+%4]
+ movq mm1, QWORD PTR [%3+%5]
+ movq mm2, QWORD PTR [%3+%5+1]
+ movq mm3, QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+;void aom_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(aom_sad16x16x3_sse3) PRIVATE
+sym(aom_sad16x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void aom_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(aom_sad16x8x3_sse3) PRIVATE
+sym(aom_sad16x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void aom_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(aom_sad8x16x3_sse3) PRIVATE
+sym(aom_sad8x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void aom_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(aom_sad8x8x3_sse3) PRIVATE
+sym(aom_sad8x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void aom_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(aom_sad4x4x3_sse3) PRIVATE
+sym(aom_sad4x4x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [ref_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, DWORD PTR [ref_ptr+1]
+ movd mm5, DWORD PTR [ref_ptr+2]
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [ref_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm6
+
+ movd mm3, DWORD PTR [ref_ptr+1]
+ movd mm7, DWORD PTR [ref_ptr+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ mov rcx, result_ptr
+
+ punpckldq mm1, mm3
+
+ movq [rcx], mm1
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
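The x3 routines in this file (and the SSSE3 variants later in the diff) all share one contract: three SADs of the same source block against the reference at horizontal offsets 0, +1 and +2 pixels, written to the results array. A scalar sketch of that contract, with an illustrative function name; each asm entry point fixes width and height instead of taking them as parameters.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the x3 kernels: results[k] is the SAD against the
 * reference block shifted right by k pixels, k = 0..2. Illustrative only. */
static void sad_x3_ref(const uint8_t *src, int src_stride,
                       const uint8_t *ref, int ref_stride,
                       int width, int height, uint32_t results[3]) {
  for (int k = 0; k < 3; ++k) {
    const uint8_t *s = src;
    const uint8_t *r = ref + k;
    uint32_t sad = 0;
    for (int row = 0; row < height; ++row) {
      for (int col = 0; col < width; ++col)
        sad += (uint32_t)abs((int)s[col] - (int)r[col]);
      s += src_stride;
      r += ref_stride;
    }
    results[k] = sad;
  }
}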
diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm
new file mode 100644
index 000000000..5e9c75845
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_sse4.asm
@@ -0,0 +1,362 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro WRITE_AS_INTS 0
+ mov rdi, arg(4) ;Results
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0
+ punpckhwd xmm2, xmm0
+
+ movdqa [rdi], xmm1
+ movdqa [rdi + 16], xmm2
+%endmacro
+
+;void aom_sad16x16x8_sse4_1(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned int *sad_array);
+global sym(aom_sad16x16x8_sse4_1) PRIVATE
+sym(aom_sad16x16x8_sse4_1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ WRITE_AS_INTS
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void aom_sad16x8x8_sse4_1(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned int *sad_array
+;);
+global sym(aom_sad16x8x8_sse4_1) PRIVATE
+sym(aom_sad16x8x8_sse4_1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ WRITE_AS_INTS
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void aom_sad8x8x8_sse4_1(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned int *sad_array
+;);
+global sym(aom_sad8x8x8_sse4_1) PRIVATE
+sym(aom_sad8x8x8_sse4_1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ WRITE_AS_INTS
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void aom_sad8x16x8_sse4_1(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned int *sad_array
+;);
+global sym(aom_sad8x16x8_sse4_1) PRIVATE
+sym(aom_sad8x16x8_sse4_1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ WRITE_AS_INTS
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void aom_sad4x4x8_sse4_1(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned int *sad_array
+;);
+global sym(aom_sad4x4x8_sse4_1) PRIVATE
+sym(aom_sad4x4x8_sse4_1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ WRITE_AS_INTS
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
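The x8 routines above use mpsadbw to evaluate eight candidate positions per row at once; as the WRITE_AS_INTS macro shows, the eight sums are widened and stored as 32-bit values. A scalar sketch of that output contract follows; the function name is illustrative, and width/height are fixed per asm entry point rather than passed in.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the x8 kernels: results[k] is the SAD of the source
 * block against the reference block shifted right by k pixels, k = 0..7,
 * matching what the mpsadbw-based loops compute. Illustrative only. */
static void sad_x8_ref(const uint8_t *src, int src_stride,
                       const uint8_t *ref, int ref_stride,
                       int width, int height, uint32_t results[8]) {
  for (int k = 0; k < 8; ++k) {
    const uint8_t *s = src;
    const uint8_t *r = ref + k;
    uint32_t sad = 0;
    for (int row = 0; row < height; ++row) {
      for (int col = 0; col < width; ++col)
        sad += (uint32_t)abs((int)s[col] - (int)r[col]);
      s += src_stride;
      r += ref_stride;
    }
    results[k] = sad;
  }
}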
diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm
new file mode 100644
index 000000000..96b64b040
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_ssse3.asm
@@ -0,0 +1,373 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm7, XMMWORD PTR [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm3, XMMWORD PTR [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ movdqa xmm4, XMMWORD PTR [rdi+rdx]
+ movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+;void aom_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(aom_sad16x16x3_ssse3) PRIVATE
+sym(aom_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp .aom_sad16x16x3_ssse3_skiptable
+.aom_sad16x16x3_ssse3_jumptable:
+ dd .aom_sad16x16x3_ssse3_aligned_by_0 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_1 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_2 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_3 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_4 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_5 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_6 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_7 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_8 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_9 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump
+ dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump
+.aom_sad16x16x3_ssse3_skiptable:
+
+ call .aom_sad16x16x3_ssse3_do_jump
+.aom_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of aom_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3
+
+.aom_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.aom_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(aom_sad16x8x3_ssse3) PRIVATE
+sym(aom_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp .aom_sad16x8x3_ssse3_skiptable
+.aom_sad16x8x3_ssse3_jumptable:
+ dd .aom_sad16x8x3_ssse3_aligned_by_0 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_1 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_2 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_3 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_4 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_5 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_6 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_7 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_8 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_9 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump
+ dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump
+.aom_sad16x8x3_ssse3_skiptable:
+
+ call .aom_sad16x8x3_ssse3_do_jump
+.aom_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of aom_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3
+
+.aom_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.aom_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
new file mode 100644
index 000000000..aa70106c8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
@@ -0,0 +1,219 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+;void aom_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Pass the parameters through a structure. The pxors are probably not
+; needed (the calling app will initialize the sums to 0), everything could
+; fit in SSE2 without much hassle, and better estimates may be possible with
+; psadbw or pavgb. At this point this is just a first pass at calculating all
+; the parameters needed for 16x16 SSIM so we can experiment with dssim as the
+; distortion metric in the mode selection code.
+global sym(aom_ssim_parms_16x16_sse2) PRIVATE
+sym(aom_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Pass the parameters through a structure. The pxors are probably not
+; needed (the calling app will initialize the sums to 0), everything could
+; fit in SSE2 without much hassle, and better estimates may be possible with
+; psadbw or pavgb. At this point this is just a first pass at calculating all
+; the parameters needed for the SSIM calculation so we can experiment with
+; dssim as the distortion metric in the mode selection code.
+global sym(aom_ssim_parms_8x8_sse2) PRIVATE
+sym(aom_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
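Both ssim_parms kernels accumulate the same five quantities per block via TABULATE_SSIM: the sums of the source and reference samples, the sums of their squares, and the sum of their products. A scalar sketch of those accumulators for an n x n block; the function name is illustrative and not part of the library.

#include <stdint.h>

/* Scalar reference for the five sums produced by the
 * aom_ssim_parms_{16x16,8x8}_sse2 kernels above. Illustrative only. */
static void ssim_parms_ref(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           int n, uint32_t *sum_s, uint32_t *sum_r,
                           uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                           uint32_t *sum_sxr) {
  *sum_s = *sum_r = *sum_sq_s = *sum_sq_r = *sum_sxr = 0;
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      *sum_s += s[j];                      /* paddusw xmm15 */
      *sum_r += r[j];                      /* paddusw xmm14 */
      *sum_sq_s += (uint32_t)s[j] * s[j];  /* pmaddwd + paddd xmm13 */
      *sum_sq_r += (uint32_t)r[j] * r[j];  /* pmaddwd + paddd xmm12 */
      *sum_sxr += (uint32_t)s[j] * r[j];   /* pmaddwd + paddd xmm11 */
    }
    s += sp;
    r += rp;
  }
}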
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 000000000..d3feb7ec0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1489 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of squared
+; errors (SSE) in the given pointer.
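+;
+; A caller typically derives the variance from the two outputs roughly as
+; follows (a sketch; the exact wrappers live in the C variance code, and
+; N * height is a power of two here):
+;   variance = *sse - (uint32_t)(((int64_t)se * se) / (N * height));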
+
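+; SUM_SSE: for two already-widened (16-bit) src/dst register pairs, accumulate
+; sum += (src - dst) in the word lanes and sse += (src - dst)^2 in the dword
+; lanes.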
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; 4xh
+ pshuflw m4, m6, 0xe
+ pshuflw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshuflw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+
+ ;Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define block_height heightd
+
+ ;Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define block_height heightd
+ %define sec_str sec_strideq
+ %else
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar block_height, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+%endif
+
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+
+%if %2 == 1 ; avg
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%endif
+ movx m1, [dstq]
+%if %1 > 4
+ movlhps m0, m2
+%else ; 4xh
+ punpckldq m0, m2
+%endif
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [secq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonzero:
+ cmp x_offsetd, 4
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m4, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
+%endif
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+ movx m1, [dstq]
+ pavgb m0, m4
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movx m2, [srcq]
+ movx m3, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ movx m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movx m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%endif
+ pavgb m2, m3
+%if %1 > 4
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; 4xh
+ punpckldq m0, m2
+ pshuflw m4, m2, 0xe
+%endif
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ;x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+;y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movx m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; are one register short of being able to keep a backup of the bilin-
+ ; filtered second line as words, as a cache for the next line. Packing into
+ ; bytes costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+ paddw m4, m3
+ movx m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonhalf:
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register, so the src_stride register is
+; used. Later, src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movx m4, [srcq]
+ movx m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movx m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movx m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+%undef movx
+ STORE_AND_RET %1
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
+; between the ssse3 and non-ssse3 version. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
new file mode 100644
index 000000000..7bd5b23ad
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,150 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void aom_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
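+;
+; For reference, the scalar operation being vectorized is roughly (a sketch):
+;   for (r = 0; r < rows; ++r)
+;     for (c = 0; c < cols; ++c)
+;       diff[r * diff_stride + c] = (int16_t)src[r * src_stride + c] -
+;                                   (int16_t)pred[r * pred_stride + c];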
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+%if CONFIG_EXT_PARTITION
+ cmp colsd, 64
+ je .case_64
+%endif
+
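+; loop16 subtracts two 16-pixel segments per invocation: it loads 16 src and
+; 16 pred bytes at offsets %1/%3 and %2/%4, widens them to 16-bit words, and
+; stores the differences at diff offsets %5 and %6.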
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ mova m1, [predq+%3]
+ mova m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+%if CONFIG_EXT_PARTITION
+ mov pred_str, pred_stridemp
+.loop_128:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
+ loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ sub rowsd, 1
+ jnz .loop_128
+ RET
+
+.case_64:
+%endif
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 000000000..6be99fbca
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "./aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
+ int stride) {
+ const __m128i v_val_0_w =
+ _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ const __m128i v_sum_d =
+ _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+
+ return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+static uint64_t
+aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ int r, c;
+
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ __m128i v_acc_q = _mm_setzero_si128();
+
+ for (r = 0; r < height; r += 8) {
+ __m128i v_acc_d = _mm_setzero_si128();
+
+ for (c = 0; c < width; c += 8) {
+ const int16_t *b = src + c;
+
+ const __m128i v_val_0_w =
+ _mm_load_si128((const __m128i *)(b + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_load_si128((const __m128i *)(b + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_load_si128((const __m128i *)(b + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_load_si128((const __m128i *)(b + 3 * stride));
+ const __m128i v_val_4_w =
+ _mm_load_si128((const __m128i *)(b + 4 * stride));
+ const __m128i v_val_5_w =
+ _mm_load_si128((const __m128i *)(b + 5 * stride));
+ const __m128i v_val_6_w =
+ _mm_load_si128((const __m128i *)(b + 6 * stride));
+ const __m128i v_val_7_w =
+ _mm_load_si128((const __m128i *)(b + 7 * stride));
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+ }
+
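+ // Widen the four 32-bit row-block sums to 64 bits before accumulating: the
+ // mask keeps dword lanes 0 and 2, the 64-bit shift brings lanes 1 and 3
+ // down, so the running total cannot overflow 32 bits.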
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 8 * stride;
+ }
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
+ return tmp;
+ }
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ // A row of 4 elements only fills half an XMM register, so this must be
+ // handled as a special case; note also that over 75% of all calls are
+ // with size == 4, so it is also the common case.
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) {
+ // Generic case
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
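+
+// Example use (a sketch; the nxn path loads with _mm_load_si128, so rows are
+// expected to be 16-byte aligned, e.g. via the library's alignment macro):
+//   DECLARE_ALIGNED(16, int16_t, residual[8 * 8]);
+//   const uint64_t ssq = aom_sum_squares_2d_i16_sse2(residual, 8, 8, 8);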
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ __m128i v_acc0_q = _mm_setzero_si128();
+ __m128i v_acc1_q = _mm_setzero_si128();
+
+ const int16_t *const end = src + n;
+
+ assert(n % 64 == 0);
+
+ while (src < end) {
+ const __m128i v_val_0_w = xx_load_128(src);
+ const __m128i v_val_1_w = xx_load_128(src + 8);
+ const __m128i v_val_2_w = xx_load_128(src + 16);
+ const __m128i v_val_3_w = xx_load_128(src + 24);
+ const __m128i v_val_4_w = xx_load_128(src + 32);
+ const __m128i v_val_5_w = xx_load_128(src + 40);
+ const __m128i v_val_6_w = xx_load_128(src + 48);
+ const __m128i v_val_7_w = xx_load_128(src + 56);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
+ v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
+
+ src += 64;
+ }
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
+ return tmp;
+ }
+#endif
+}
+
+uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
+ if (n % 64 == 0) {
+ return aom_sum_squares_i16_64n_sse2(src, n);
+ } else if (n > 64) {
+ int k = n & ~(64 - 1);
+ return aom_sum_squares_i16_64n_sse2(src, k) +
+ aom_sum_squares_i16_c(src + k, n - k);
+ } else {
+ return aom_sum_squares_i16_c(src, n);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
new file mode 100644
index 000000000..bef606dae
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SYNONYMS_H_
+#define AOM_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128-bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256-bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+ return _mm_cvtsi32_si128(*(const uint32_t *)a);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i *)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+ return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i *)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+ *(uint32_t *)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+ _mm_store_si128((__m128i *)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i *)a, v);
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
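+// _mm_avg_epu16(x, 0) computes (x + 1) >> 1, so shifting right by (bits - 1)
+// first yields ROUND_POWER_OF_TWO(v_val_w, bits) for unsigned 16-bit lanes.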
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+ const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+ const __m128i v_tmp_d =
+ _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#ifdef __SSSE3__
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
+ v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+ return _mm_cvtsi128_si64(v_q);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_q);
+ return tmp;
+ }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
+ const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+ const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+ const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+ return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+#endif // __SSSE3__
+
+#endif // AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 000000000..39e9b8e2a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
+#define AOM_DSP_X86_TXFM_COMMON_AVX2_H
+
+#include <immintrin.h>
+
+#include "aom_dsp/txfm_common.h"
+
+#define pair256_set_epi16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define pair256_set_epi32(a, b) \
+ _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
+ (int)(b), (int)(a))
+
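+// Reverses the sixteen 16-bit elements of *u: the byte shuffle reverses the
+// words within each 128-bit lane, then the permute swaps the two lanes.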
+static INLINE void mm256_reverse_epi16(__m256i *u) {
+ const __m256i control = _mm256_set_epi16(
+ 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
+ 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
+ __m256i v = _mm256_shuffle_epi8(*u, control);
+ *u = _mm256_permute2x128_si256(v, v, 1);
+}
+
+static INLINE void mm256_transpose_16x16(__m256i *in) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+ // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+ // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+ // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+ // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+
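+// One butterfly stage: a0 and a1 hold interleaved 16-bit input pairs and
+// cospi holds the paired cosine constants; the two madd results are rounded,
+// shifted down by DCT_CONST_BITS and packed back to sixteen 16-bit outputs.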
+static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i y0 = _mm256_madd_epi16(a0, cospi);
+ __m256i y1 = _mm256_madd_epi16(a1, cospi);
+
+ y0 = _mm256_add_epi32(y0, dct_rounding);
+ y1 = _mm256_add_epi32(y1, dct_rounding);
+ y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
+
+ return _mm256_packs_epi32(y0, y1);
+}
+
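+// Scales all 16 rows by the 16-bit constant c with DCT rounding: each row is
+// doubled, then the zero interleave plus pmaddwd yields the exact 32-bit
+// per-element product with c before the rounded shift by DCT_CONST_BITS.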
+static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
+ const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i u0, u1;
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_slli_epi16(in[i], 1);
+
+ u0 = _mm256_unpacklo_epi16(zero, in[i]);
+ u1 = _mm256_unpackhi_epi16(zero, in[i]);
+
+ u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
+ u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
+
+ u0 = _mm256_add_epi32(u0, dct_const_rounding);
+ u1 = _mm256_add_epi32(u1, dct_const_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ in[i] = _mm256_packs_epi32(u0, u1);
+ i++;
+ }
+}
+
+#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
new file mode 100644
index 000000000..e4ac56339
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+
+// Note:
+// This header file should be included after any x86 intrinsics header files.
+
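+// Store eight 16-bit coefficients to a tran_low_t buffer.  With
+// CONFIG_HIGHBITDEPTH, tran_low_t is 32 bits wide, so the values are
+// sign-extended (via a compare-generated sign mask) and written with two
+// unaligned 128-bit stores; otherwise the vector is stored as-is.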
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_HIGHBITDEPTH
+}
+
+#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 000000000..4257d8b9c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#define pair_set_epi16(a, b) \
+ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define dual_set_epi16(a, b) \
+ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
+ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
+
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+// Reverse the order of the eight 16-bit words in a __m128i.
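+// (shuffle 0x1b reverses the four words of each 64-bit half via
+// shufflelo/shufflehi, and 0x4e then swaps the two halves.)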
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+ const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+ const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+ return _mm_shuffle_epi32(b, 0x4e);
+}
+
+#if CONFIG_EXT_TX
+// Identity transform (both forward and inverse).
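+// Each 16-bit coefficient x is replaced by
+// ROUND_POWER_OF_TWO(2 * x * Sqrt2, DCT_CONST_BITS), processed as 16 rows
+// of eight values.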
+static INLINE void idtx16_8col(__m128i *in) {
+ const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+ const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+ in[8] = _mm_slli_epi16(in[8], 1);
+ in[9] = _mm_slli_epi16(in[9], 1);
+ in[10] = _mm_slli_epi16(in[10], 1);
+ in[11] = _mm_slli_epi16(in[11], 1);
+ in[12] = _mm_slli_epi16(in[12], 1);
+ in[13] = _mm_slli_epi16(in[13], 1);
+ in[14] = _mm_slli_epi16(in[14], 1);
+ in[15] = _mm_slli_epi16(in[15], 1);
+
+ v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+ v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+ v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+ v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+ v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
+ v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
+ v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
+ v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
+
+ u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
+ u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
+ u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
+ u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
+ u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
+ u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
+ u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
+ u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
+
+ x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
+ x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
+ x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
+ x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
+ x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
+ x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
+ x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
+ x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
+
+ y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
+ y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
+ y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
+ y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
+ y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
+ y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
+ y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
+ y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
+
+ v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+ v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+ v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+ v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+ v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
+ v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
+ v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
+ v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
+
+ x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
+ x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
+ x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
+ x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
+ x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
+ x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
+ x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
+ x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
+
+ u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
+ u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
+ u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
+ u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
+ u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
+ u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
+ u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
+ u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
+
+ y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
+ y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
+ y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
+ y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
+ y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
+ y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
+ y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
+ y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
+
+ v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+ x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+ x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+ x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+ x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+ x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+ x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+ x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+ y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+ y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+ y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+ y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
+ y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
+ y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
+ y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+ x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+ x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+ x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+ x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+ x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+ x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+ x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+ u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+ y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+ y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+ y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+ y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+ y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+ y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(v0, x0);
+ in[1] = _mm_packs_epi32(v1, x1);
+ in[2] = _mm_packs_epi32(v2, x2);
+ in[3] = _mm_packs_epi32(v3, x3);
+ in[4] = _mm_packs_epi32(v4, x4);
+ in[5] = _mm_packs_epi32(v5, x5);
+ in[6] = _mm_packs_epi32(v6, x6);
+ in[7] = _mm_packs_epi32(v7, x7);
+
+ in[8] = _mm_packs_epi32(u0, y0);
+ in[9] = _mm_packs_epi32(u1, y1);
+ in[10] = _mm_packs_epi32(u2, y2);
+ in[11] = _mm_packs_epi32(u3, y3);
+ in[12] = _mm_packs_epi32(u4, y4);
+ in[13] = _mm_packs_epi32(u5, y5);
+ in[14] = _mm_packs_epi32(u6, y6);
+ in[15] = _mm_packs_epi32(u7, y7);
+}
+#endif // CONFIG_EXT_TX
+
+static INLINE void scale_sqrt2_8x4(__m128i *in) {
+ // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32
+ // consecutive elements.
+ const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
+
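+  // _mm_mullo_epi16/_mm_mulhi_epi16 give the low and high 16 bits of each
+  // signed product; interleaving them below reconstructs the full 32-bit
+  // products before the rounding shift.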
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+ const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
+ const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
+ const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
+ const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+ in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
+ in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
+}
+
+static INLINE void scale_sqrt2_8x8(__m128i *in) {
+ // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
+ // for each element.
+ const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+ const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
+ const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
+ const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
+ const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
+ const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
+ const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
+ const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
+ const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
+ const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
+ const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
+ const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
+ const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
+ const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
+ const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
+ const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
+ const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
+ const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
+ const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
+ const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+ in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
+ in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
+ in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
+ in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
+ in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
+ in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
+}
+
+static INLINE void scale_sqrt2_8x16(__m128i *in) {
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+}
+
+#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
new file mode 100644
index 000000000..18a70dffe
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "./aom_dsp_rtcd.h"
+
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
+void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, unsigned int *sse,
+ int *sum);
+
+static void variance_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w, int h,
+ unsigned int *sse, int *sum, get_var_avx2 var_fn,
+ int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
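+  // Both helper kernels (aom_get16x16var_avx2 and aom_get32x32var_avx2)
+  // accumulate over 16 rows per call, so the row loop always steps by 16
+  // while the column loop steps by the kernel width (block_size).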
+ for (i = 0; i < h; i += 16) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ unsigned int variance;
+ variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_get16x16var_avx2, 16);
+
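+  // variance = SSE - sum^2 / N, with N = 16 * 16 = 256 (hence the >> 8).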
+ variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+ _mm256_zeroupper();
+ return *sse;
+}
+
+unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ unsigned int variance;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
+ aom_get32x32var_avx2, 32);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ unsigned int variance;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
+ aom_get32x32var_avx2, 32);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ unsigned int variance;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
+ aom_get32x32var_avx2, 32);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ unsigned int variance;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
+ aom_get32x32var_avx2, 32);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse);
+
+unsigned int aom_sub_pixel_avg_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
+ int height, unsigned int *sseptr);
+
+unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
+ int src_stride, int x_offset,
+ int y_offset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ unsigned int sse1;
+ const int se1 = aom_sub_pixel_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
+ unsigned int sse2;
+ const int se2 =
+ aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
+ dst + 32, dst_stride, 64, &sse2);
+ const int se = se1 + se2;
+ unsigned int variance;
+ *sse = sse1 + sse2;
+
+ variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
+ int src_stride, int x_offset,
+ int y_offset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ const int se = aom_sub_pixel_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
+
+ const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_sub_pixel_avg_variance64x64_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
+ unsigned int sse1;
+ const int se1 = aom_sub_pixel_avg_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
+ unsigned int sse2;
+ const int se2 = aom_sub_pixel_avg_variance32xh_avx2(
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
+ 64, 64, &sse2);
+ const int se = se1 + se2;
+ unsigned int variance;
+
+ *sse = sse1 + sse2;
+
+ variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
+ _mm256_zeroupper();
+ return variance;
+}
+
+unsigned int aom_sub_pixel_avg_variance32x32_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
+ // Process 32 elements in parallel.
+ const int se = aom_sub_pixel_avg_variance32xh_avx2(
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
+
+ const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
+ _mm256_zeroupper();
+ return variance;
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
new file mode 100644
index 000000000..999b541e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
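+// Bilinear filter tap pairs (16 - w, w); each pair is repeated across a
+// 32-byte row so a whole filter can be loaded into one 256-bit register,
+// and callers index the table with (offset << 5).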
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+/* clang-format on */
+
+void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *SSE, int *Sum) {
+ __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+ __m256i ref_expand_high, madd_low, madd_high;
+ unsigned int i, src_2strides, ref_2strides;
+ __m256i zero_reg = _mm256_set1_epi16(0);
+ __m256i sum_ref_src = _mm256_set1_epi16(0);
+ __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+  // Process two rows per 256-bit register, halving the number of loop
+  // iterations compared to the SSE2 code.
+ src_2strides = source_stride << 1;
+ ref_2strides = recon_stride << 1;
+ for (i = 0; i < 8; i++) {
+ src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
+ src = _mm256_inserti128_si256(
+ src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
+
+ ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
+ ref = _mm256_inserti128_si256(
+ ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
+
+    // zero-extend each byte to a 16-bit lane
+ src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+ src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+ ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+ ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+ // src-ref
+ src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+ src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+ // madd low (src - ref)
+ madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+ // add high to low
+ src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+ // madd high (src - ref)
+ madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+ sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+ // add high to low
+ madd_ref_src =
+ _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
+
+ src_ptr += src_2strides;
+ ref_ptr += ref_2strides;
+ }
+
+ {
+ __m128i sum_res, madd_res;
+ __m128i expand_sum_low, expand_sum_high, expand_sum;
+ __m128i expand_madd_low, expand_madd_high, expand_madd;
+ __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+ // extract the low lane and add it to the high lane
+ sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
+ _mm256_extractf128_si256(sum_ref_src, 1));
+
+ madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
+ _mm256_extractf128_si256(madd_ref_src, 1));
+
+    // place each 16-bit sum in the upper half of a 32-bit lane
+ expand_sum_low =
+ _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
+ expand_sum_high =
+ _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
+
+    // arithmetic shift right by 16 sign-extends the sums to 32 bits
+ expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
+ expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
+
+ expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
+
+ // expand each 32 bits of the madd result to 64 bits
+ expand_madd_low =
+ _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
+ expand_madd_high =
+ _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
+
+ expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
+
+ ex_expand_sum_low =
+ _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
+ ex_expand_sum_high =
+ _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
+
+ ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+    // shift right by 8 bytes
+ madd_res = _mm_srli_si128(expand_madd, 8);
+ sum_res = _mm_srli_si128(ex_expand_sum, 8);
+
+ madd_res = _mm_add_epi32(madd_res, expand_madd);
+ sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
+
+ *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
+
+ *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
+ }
+ _mm256_zeroupper();
+}
+
+void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *SSE, int *Sum) {
+ __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+ __m256i ref_expand_high, madd_low, madd_high;
+ unsigned int i;
+ __m256i zero_reg = _mm256_set1_epi16(0);
+ __m256i sum_ref_src = _mm256_set1_epi16(0);
+ __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+ // processing 32 elements in parallel
+ for (i = 0; i < 16; i++) {
+ src = _mm256_loadu_si256((__m256i const *)(src_ptr));
+
+ ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
+
+    // zero-extend each byte to a 16-bit lane
+ src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+ src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+ ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+ ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+ // src-ref
+ src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+ src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+ // madd low (src - ref)
+ madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+ // add high to low
+ src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+ // madd high (src - ref)
+ madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+ sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+ // add high to low
+ madd_ref_src =
+ _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+
+ {
+ __m256i expand_sum_low, expand_sum_high, expand_sum;
+ __m256i expand_madd_low, expand_madd_high, expand_madd;
+ __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+    // place each 16-bit sum in the upper half of a 32-bit lane
+ expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
+ expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
+
+    // arithmetic shift right by 16 sign-extends the sums to 32 bits
+ expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
+ expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
+
+ expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
+
+ // expand each 32 bits of the madd result to 64 bits
+ expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
+ expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
+
+ expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
+
+ ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
+ ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
+
+ ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+    // shift right by 8 bytes
+ madd_ref_src = _mm256_srli_si256(expand_madd, 8);
+ sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
+
+ madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
+ sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
+
+ // extract the low lane and the high lane and add the results
+ *((int *)SSE) =
+ _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
+
+ *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
+ }
+ _mm256_zeroupper();
+}
+
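+// The bilinear taps sum to 16, so after _mm256_maddubs_epi16 the weighted
+// sums are rounded back to pixel range by adding 8 and shifting right by 4.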
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+ /* load source and destination */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */                                      \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// final calculation to sum and sse
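+// The 16-bit per-lane sums are sign-extended to 32 bits using the mask from
+// the compare against zero, then sum and SSE are reduced horizontally within
+// each 128-bit lane and finally across the two lanes via extractf128.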
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
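+  // The three-way split below handles, for x and then y: offset == 0 (no
+  // interpolation), offset == 8 (plain average of the two neighbouring
+  // pixels/rows) and any other offset (bilinear filtering).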
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 0 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second copy offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+        // average the previous and the current averages
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+        // accumulate the sum and SSE for this row
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second copy offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8-bit pixels in each lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previous packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second copy offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8-bit pixels in each lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // interleave the previous packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ _mm256_zeroupper();
+ return sum;
+}
+
+unsigned int aom_sub_pixel_avg_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
+ int height, unsigned int *sse) {
+ __m256i sec_reg;
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second copy offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+        // average the previous and the current averages
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ sec += sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second copy offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ sec += sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ sec += sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8-bit pixels in each lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previous packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ sec += sec_stride;
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second copy offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8-bit pixels in each lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // interleave the previous packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ sec += sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ _mm256_zeroupper();
+ return sum;
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
new file mode 100644
index 000000000..d9563aa7f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
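+// Sum of squares of 256 consecutive 16-bit values (e.g. a 16x16 block of
+// differences), accumulated eight at a time with _mm_madd_epi16.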
+unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = _mm_loadu_si128((const __m128i *)src);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src += 8;
+ }
+
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ return _mm_cvtsi128_si32(vsum);
+}
+
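+// Load two 4-byte rows (rows i and i + 1) and interleave their bytes into
+// the low 64 bits of an xmm register.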
+#define READ64(p, stride, i) \
+ _mm_unpacklo_epi8( \
+ _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+ _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
+static void get4x4var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+ const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ // sum
+ __m128i vsum = _mm_add_epi16(diff0, diff1);
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsum =
+ _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ *sse = _mm_cvtsi128_si32(vsum);
+}
+
+void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; i += 2) {
+ const __m128i src0 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+}
+
+void aom_get16x16var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, unsigned int *sse,
+ int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ const __m128i s = _mm_loadu_si128((const __m128i *)src);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum =
+ (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+}
+
+static void variance_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride, int w,
+ int h, unsigned int *sse, int *sum,
+ getNxMvar_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+ assert(sum <= 255 * 4 * 4);
+ assert(sum >= -255 * 4 * 4);
+ return *sse - ((sum * sum) >> 4);
+}
+
+unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
+ get4x4var_sse2, 4);
+ assert(sum <= 255 * 8 * 4);
+ assert(sum >= -255 * 8 * 4);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
+ get4x4var_sse2, 4);
+ assert(sum <= 255 * 8 * 4);
+ assert(sum >= -255 * 8 * 4);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+ assert(sum <= 255 * 8 * 8);
+ assert(sum >= -255 * 8 * 8);
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
+ aom_get8x8var_sse2, 8);
+ assert(sum <= 255 * 16 * 8);
+ assert(sum >= -255 * 16 * 8);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
+ aom_get8x8var_sse2, 8);
+ assert(sum <= 255 * 16 * 8);
+ assert(sum >= -255 * 16 * 8);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+ assert(sum <= 255 * 16 * 16);
+ assert(sum >= -255 * 16 * 16);
+ return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
+}
+
+unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 32 * 32);
+ assert(sum >= -255 * 32 * 32);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 32 * 16);
+ assert(sum >= -255 * 32 * 16);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 32 * 16);
+ assert(sum >= -255 * 32 * 16);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 64 * 64);
+ assert(sum >= -255 * 64 * 64);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
+}
+
+unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 64 * 32);
+ assert(sum >= -255 * 64 * 32);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 64 * 32);
+ assert(sum >= -255 * 64 * 32);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These declarations are for functions defined in subpel_variance.asm.
+#define DECL(w, opt) \
+ int aom_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt1); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
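+// Only opt1 is used; e.g. DECLS(sse2, sse2) declares the 4xh, 8xh and 16xh
+// sse2 column kernels from subpel_variance.asm.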
+
+DECLS(sse2, sse2);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
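+// FN(w, h, wf, wlog2, hlog2, ...) builds the wrapper for a w x h block from
+// the column kernels declared above. The asm kernel covers a strip wf pixels
+// wide, so wider blocks invoke it again at byte offsets +16, +32 and +48 and
+// accumulate the partial se/sse results. The returned variance subtracts
+// (se * se) >> (wlog2 + hlog2), i.e. se^2 / (w * h).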
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ unsigned int sse; \
+ int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ h, &sse, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
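+// The second FNS argument is unused. cast_prod/cast select the arithmetic for
+// se * se: from 16x16 upward |se| can reach 255 * w * h, so the square no
+// longer fits in an int32_t and must be formed as a 64-bit product.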
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
+
+FNS(sse2, sse2);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int aom_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt1); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse2);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
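+// Same column-splitting scheme as the plain sub-pixel wrappers above; the avg
+// kernels additionally take the second predictor `sec`, which is laid out
+// with stride w (hence the sec + 16/32/48 column offsets).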
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sseptr, \
+ const uint8_t *sec) { \
+ unsigned int sse; \
+ int se = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sseptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
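+// `ref` points into a reference that has been upsampled by 8 in each
+// dimension: the code steps 8 bytes per output pixel and 8 rows
+// (ref_stride << 3) per output row. The unpack cascades below gather every
+// 8th byte, so comp_pred receives the integer-position samples of the
+// upsampled reference.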
+void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
+ const uint8_t *ref, int ref_stride) {
+ int i, j;
+ int stride = ref_stride << 3;
+
+ if (width >= 16) {
+ // Process 16 output pixels per iteration (128 bytes of upsampled ref).
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 16) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+ __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+ __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+ __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
+ __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
+ __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
+ __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
+ __m128i t0, t1, t2, t3;
+
+ t0 = _mm_unpacklo_epi8(s0, s1);
+ s1 = _mm_unpackhi_epi8(s0, s1);
+ t1 = _mm_unpacklo_epi8(s2, s3);
+ s3 = _mm_unpackhi_epi8(s2, s3);
+ t2 = _mm_unpacklo_epi8(s4, s5);
+ s5 = _mm_unpackhi_epi8(s4, s5);
+ t3 = _mm_unpacklo_epi8(s6, s7);
+ s7 = _mm_unpackhi_epi8(s6, s7);
+
+ s0 = _mm_unpacklo_epi8(t0, s1);
+ s2 = _mm_unpacklo_epi8(t1, s3);
+ s4 = _mm_unpacklo_epi8(t2, s5);
+ s6 = _mm_unpacklo_epi8(t3, s7);
+ s0 = _mm_unpacklo_epi32(s0, s2);
+ s4 = _mm_unpacklo_epi32(s4, s6);
+ s0 = _mm_unpacklo_epi64(s0, s4);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), s0);
+ comp_pred += 16;
+ ref += 16 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ } else if (width >= 8) {
+ // Process 8 output pixels per iteration (64 bytes of upsampled ref).
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+ __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+ __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+ __m128i t0, t1;
+
+ t0 = _mm_unpacklo_epi8(s0, s1);
+ s1 = _mm_unpackhi_epi8(s0, s1);
+ t1 = _mm_unpacklo_epi8(s2, s3);
+ s3 = _mm_unpackhi_epi8(s2, s3);
+
+ s0 = _mm_unpacklo_epi8(t0, s1);
+ s2 = _mm_unpacklo_epi8(t1, s3);
+ s0 = _mm_unpacklo_epi32(s0, s2);
+
+ _mm_storel_epi64((__m128i *)(comp_pred), s0);
+ comp_pred += 8;
+ ref += 8 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ } else {
+ // Process 4 output pixels per iteration (32 bytes of upsampled ref).
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+ __m128i t0;
+
+ t0 = _mm_unpacklo_epi8(s0, s1);
+ s1 = _mm_unpackhi_epi8(s0, s1);
+ s0 = _mm_unpacklo_epi8(t0, s1);
+
+ *(int *)comp_pred = _mm_cvtsi128_si32(s0);
+ comp_pred += 4;
+ ref += 4 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ }
+}
+
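+// Same every-8th-byte gather as aom_upsampled_pred_sse2, followed by a
+// rounded average with the first predictor:
+//   comp_pred = (ref_px + pred + 1) >> 1,
+// computed in 16-bit lanes so the sums cannot overflow.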
+void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride) {
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ int i, j;
+ int stride = ref_stride << 3;
+
+ if (width >= 16) {
+ // Process 16 output pixels per iteration (128 bytes of upsampled ref).
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 16) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+ __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+ __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+ __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
+ __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
+ __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
+ __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i p1;
+ __m128i t0, t1, t2, t3;
+
+ t0 = _mm_unpacklo_epi8(s0, s1);
+ s1 = _mm_unpackhi_epi8(s0, s1);
+ t1 = _mm_unpacklo_epi8(s2, s3);
+ s3 = _mm_unpackhi_epi8(s2, s3);
+ t2 = _mm_unpacklo_epi8(s4, s5);
+ s5 = _mm_unpackhi_epi8(s4, s5);
+ t3 = _mm_unpacklo_epi8(s6, s7);
+ s7 = _mm_unpackhi_epi8(s6, s7);
+
+ s0 = _mm_unpacklo_epi8(t0, s1);
+ s2 = _mm_unpacklo_epi8(t1, s3);
+ s4 = _mm_unpacklo_epi8(t2, s5);
+ s6 = _mm_unpacklo_epi8(t3, s7);
+
+ s0 = _mm_unpacklo_epi32(s0, s2);
+ s4 = _mm_unpacklo_epi32(s4, s6);
+ s0 = _mm_unpacklo_epi8(s0, zero);
+ s4 = _mm_unpacklo_epi8(s4, zero);
+
+ p1 = _mm_unpackhi_epi8(p0, zero);
+ p0 = _mm_unpacklo_epi8(p0, zero);
+ p0 = _mm_adds_epu16(s0, p0);
+ p1 = _mm_adds_epu16(s4, p1);
+ p0 = _mm_adds_epu16(p0, one);
+ p1 = _mm_adds_epu16(p1, one);
+
+ p0 = _mm_srli_epi16(p0, 1);
+ p1 = _mm_srli_epi16(p1, 1);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), p0);
+ comp_pred += 16;
+ pred += 16;
+ ref += 16 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ } else if (width >= 8) {
+ // Process 8 output pixels per iteration (64 bytes of upsampled ref).
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+ __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+ __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+ __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+ __m128i t0, t1;
+
+ t0 = _mm_unpacklo_epi8(s0, s1);
+ s1 = _mm_unpackhi_epi8(s0, s1);
+ t1 = _mm_unpacklo_epi8(s2, s3);
+ s3 = _mm_unpackhi_epi8(s2, s3);
+
+ s0 = _mm_unpacklo_epi8(t0, s1);
+ s2 = _mm_unpacklo_epi8(t1, s3);
+ s0 = _mm_unpacklo_epi32(s0, s2);
+ s0 = _mm_unpacklo_epi8(s0, zero);
+
+ p0 = _mm_unpacklo_epi8(p0, zero);
+ p0 = _mm_adds_epu16(s0, p0);
+ p0 = _mm_adds_epu16(p0, one);
+ p0 = _mm_srli_epi16(p0, 1);
+ p0 = _mm_packus_epi16(p0, zero);
+
+ _mm_storel_epi64((__m128i *)(comp_pred), p0);
+ comp_pred += 8;
+ pred += 8;
+ ref += 8 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ } else {
+ // Process 4 output pixels per iteration (32 bytes of upsampled ref).
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+ __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred);
+ __m128i t0;
+
+ t0 = _mm_unpacklo_epi8(s0, s1);
+ s1 = _mm_unpackhi_epi8(s0, s1);
+ s0 = _mm_unpacklo_epi8(t0, s1);
+ s0 = _mm_unpacklo_epi8(s0, zero);
+
+ p0 = _mm_unpacklo_epi8(p0, zero);
+ p0 = _mm_adds_epu16(s0, p0);
+ p0 = _mm_adds_epu16(p0, one);
+ p0 = _mm_srli_epi16(p0, 1);
+ p0 = _mm_packus_epi16(p0, zero);
+
+ *(int *)comp_pred = _mm_cvtsi128_si32(p0);
+ comp_pred += 4;
+ pred += 4;
+ ref += 4 * 8;
+ }
+ ref += stride - (width << 3);
+ }
+ }
+}